* @license http://www.contenido.org/license/LIZENZ.txt * @link http://www.4fb.de * @link http://www.contenido.org * @since file available since contenido release <= 4.6 * * {@internal * created 2004-01-15 * modified 2008-06-30, Frederic Schneider, add security fix * modified 2008-07-11, Dominik Ziegler, marked class search_helper as deprecated * modified 2008-11-12, Andreas Lindner, add special treatment for iso-8859-2 * modified 2011-02-08, Murat Purc, removed PHP 4.3 related code, cleanup and formatting, created SearchBaseAbstract class * * $Id$: * }} * */ if (!defined('CON_FRAMEWORK')) { die('Illegal call'); } /** * Abstract base search class. Provides general properties and functions * for child implementations. * * @author Murat Purc */ abstract class SearchBaseAbstract { /** * Contenido database object * @var DB_ConLite */ protected $oDB; /** * Contenido configuration data * @var array */ protected $cfg; /** * Language id of a client * @var int */ protected $lang; /** * Client id * @var int */ protected $client; /** * Flag to enable debug * @var bool */ protected $bDebug; /** * Initialises some properties * * @param DB_ConLite $oDB Optional database instance * @param bool $bDebug Optional, flag to enable debugging */ protected function __construct($oDB = null, $bDebug = false) { global $cfg, $lang, $client; $this->cfg = $cfg; $this->lang = $lang; $this->client = $client; $this->setDebug((bool) $bDebug); if ($oDB == null) { $this->db = new DB_ConLite(); } elseif (is_object($oDB)) { $this->db = $oDB; } } /** * Setter for debug * * @param bool $bDebug */ public function setDebug($bDebug) { $this->bDebug = (bool) $bDebug; } /** * Main debug function, prints dumps parameter if debugging is enabled * * @param string $msg Some text * @param mixed $var The variable to dump */ protected function _debug($msg, $var) { if (!$this->bDebug) { return; } $dump = '
' . $msg . ': ';
        if (is_array($var) || is_object($var)) {
            $dump .= print_r($var, true);
        } else {
            $dump .= $var;
        }
        $dump .= '
' . "\n"; echo $dump; } } /** * Contenido API - Index Object * * This object creates an index of an article * * Create object with * $oIndex = new Index($db); # where $db is the global Contenido database object. * Start indexing with * $oIndex->start($idart, $aContent); * where $aContent is the complete content of an article specified by its content types. * It looks like * Array ( * [CMS_HTMLHEAD] => Array ( * [1] => Herzlich Willkommen... * [2] => ...auf Ihrer Website! * ) * [CMS_HTML] => Array ( * [1] => Die Inhalte auf dieser Website ... * * The index for keyword 'willkommen' would look like '&12=1(CMS_HTMLHEAD-1)' which means the keyword 'willkommen' occurs 1 times in article with articleId 12 and content type CMS_HTMLHEAD[1]. * * TODO: The basic idea of the indexing process is to take the complete content of an article and to generate normalized index terms * from the content and to store a specific index structure in the relation 'con_keywords'. * To take the complete content is not very flexible. It would be better to differentiate by specific content types or by any content. * The &, =, () and - seperated string is not easy to parse to compute the search result set. * It would be a better idea (and a lot of work) to extend the relation 'con_keywords' to store keywords by articleId (or content source identifier) and content type. * The functions removeSpecialChars, setStopwords, setContentTypes and setCmsOptions should be sourced out into a new helper-class. * Keep in mind that class Search and SearchResult uses an instance of object Index. * Consider character tables in relation 'con_chartable'. */ cInclude('includes', 'functions.encoding.php'); class Index extends SearchBaseAbstract { /** * the content of the cms-types of an article * @var array */ var $keycode = array(); /** * the list of keywords of an article * @var array */ var $keywords = array(); /** * the words, which should not be indexed * @var array */ var $stopwords = array(); /** * the keywords of an article stored in the DB * @var array */ var $keywords_old = array(); /** * the keywords to be deleted * @var array */ var $keywords_del = array(); /** * 'auto' or 'self' * The field 'auto' in table con_keywords is used for automatic indexing. * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)", which means a keyword occurs 2 times in article with $idart 12 * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1]. * The field 'self' can be used in the article properties to index the article manually. * @var string */ var $place; /** * array of cms types * @var array */ var $cms_options = array(); /** * array of all available cms types * * htmlhead - HTML Headline * html - HTML Text * head - Headline (no HTML) * text - Text (no HTML) * img - Upload id of the element * imgdescr - Image description * link - Link (URL) * linktarget - Linktarget (_self, _blank, _top ...) * linkdescr - Linkdescription * swf - Upload id of the element * etc. * * @var array */ var $cms_type = array(); /** * the suffix of all available cms types * @var array */ var $cms_type_suffix = array(); /** * Constructor, set object properties * @param DB_ConLite $oDB Contenido Database object * @return void */ function __construct($oDB = null) { parent::__construct($oDB); $this->setContentTypes(); } /** * Start indexing the article. * * @param int $idart Article Id * @param array $aContent The complete content of an article specified by its content types. * It looks like * Array ( * [CMS_HTMLHEAD] => Array ( * [1] => Herzlich Willkommen... * [2] => ...auf Ihrer Website! * ) * [CMS_HTML] => Array ( * [1] => Die Inhalte auf dieser Website ... * * @param string $place The field where to store the index information in db. * @param array $cms_options One can specify explicitly cms types which should not be indexed. * @param array $aStopwords Array with words which should not be indexed. * @return void */ function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) { if (!is_int((int) $idart) || $idart < 0) { return null; } else { $this->idart = $idart; } $this->place = $place; $this->keycode = $aContent; $this->setStopwords($aStopwords); $this->setCmsOptions($cms_options); $this->createKeywords(); $this->getKeywords(); $this->saveKeywords(); $new_keys = array_keys($this->keywords); $old_keys = array_keys($this->keywords_old); $this->keywords_del = array_diff($old_keys, $new_keys); if (count($this->keywords_del) > 0) { $this->deleteKeywords(); } } /** * for each cms-type create index structure. * it looks like * Array ( * [die] => CMS_HTML-1 * [inhalte] => CMS_HTML-1 * [auf] => CMS_HTML-1 CMS_HTMLHEAD-2 * [dieser] => CMS_HTML-1 * [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2 * ) * * @param none * @return void */ function createKeywords() { $tmp_keys = array(); $replace = array(' ', '&', '<', '>', '"', '''); // Only create keycodes, if some are available if (is_array($this->keycode)) { foreach ($this->keycode as $idtype => $data) { if ($this->checkCmsType($idtype)) { foreach ($data as $typeid => $code) { $this->_debug('code', $code); $code = stripslashes($code); // remove backslash $code = str_ireplace(array('
', '
'), "\n", $code); // replace HTML line breaks with newlines $code = strip_tags($code); // remove html tags if (strlen($code) > 0) { $code = clHtmlEntityDecode($code); } $this->_debug('code', $code); $tmp_keys = preg_split('/[\s,]+/', trim($code)); // split content by any number of commas or space characters $this->_debug('tmp_keys', $tmp_keys); foreach ($tmp_keys as $value) { $value = strtolower($value); // index terms are stored with lower case if (!in_array($value, $this->stopwords)) { // eliminate stopwords $value = $this->removeSpecialChars($value); if (strlen($value) > 1) { // do not index single characters $this->keywords[$value] = $this->keywords[$value] . $idtype . '-' . $typeid . ' '; } } } } } unset($tmp_keys); } } $this->_debug('keywords', $this->keywords); } /** * generate index_string from index structure and save keywords * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)" * @return void */ function saveKeywords() { $tmp_count = array(); foreach ($this->keywords as $keyword => $count) { $tmp_count = preg_split('/[\s]/', trim($count)); $this->_debug('tmp_count', $tmp_count); $occurrence = count($tmp_count); $tmp_count = array_unique($tmp_count); $cms_types = implode(',', $tmp_count); $index_string = '&' . $this->idart . '=' . $occurrence . '(' . $cms_types . ')'; if (!array_key_exists($keyword, $this->keywords_old)) { // if keyword is new, save index information $nextid = $this->db->nextid($this->cfg['tab']['keywords']); $sql = "INSERT INTO " . $this->cfg['tab']['keywords'] . " (keyword, " . $this->place . ", idlang, idkeyword) VALUES ('" . Contenido_Security::escapeDB($keyword, $this->db) . "', '" . Contenido_Security::escapeDB($index_string, $this->db) . "', " . Contenido_Security::toInteger($this->lang) . ", " . Contenido_Security::toInteger($nextid) . ")"; } else { // if keyword allready exists, create new index_string if (preg_match("/&$this->idart=/", $this->keywords_old[$keyword])) { $index_string = preg_replace("/&$this->idart=[0-9]+\([\w-,]+\)/", $index_string, $this->keywords_old[$keyword]); } else { $index_string = $this->keywords_old[$keyword] . $index_string; } $sql = "UPDATE " . $this->cfg['tab']['keywords'] . " SET " . $this->place . " = '" . $index_string . "' WHERE idlang='" . Contenido_Security::toInteger($this->lang) . "' AND keyword='" . Contenido_Security::escapeDB($keyword, $this->db) . "'"; } $this->_debug('sql', $sql); $this->db->query($sql); } } /** * if keywords don't occur in the article anymore, update index_string and delete keyword if necessary * @param none * @return void */ function deleteKeywords() { foreach ($this->keywords_del as $key_del) { $index_string = preg_replace("/&$this->idart=[0-9]+\([\w-,]+\)/", "", $this->keywords_old[$key_del]); if (strlen($index_string) == 0) { // keyword is not referenced by any article $sql = "DELETE FROM " . $this->cfg['tab']['keywords'] . " WHERE idlang='" . Contenido_Security::toInteger($this->lang) . "' AND keyword='" . Contenido_Security::escapeDB($key_del, $this->db) . "'"; } else { $sql = "UPDATE " . $this->cfg['tab']['keywords'] . " SET " . $this->place . " = '" . $index_string . "' WHERE idlang='" . Contenido_Security::toInteger($this->lang) . "' AND keyword='" . Contenido_Security::escapeDB($key_del, $this->db) . "'"; } $this->_debug('sql', $sql); $this->db->query($sql); } } /** * get the keywords of an article * @param none * @return void */ function getKeywords() { $keys = implode("','", array_keys($this->keywords)); $sql = "SELECT keyword, auto, self FROM " . $this->cfg['tab']['keywords'] . " WHERE idlang=" . Contenido_Security::toInteger($this->lang) . " AND (keyword IN ('" . $keys . "') OR " . $this->place . " REGEXP '&" . Contenido_Security::toInteger($this->idart) . "=')"; $this->_debug('sql', $sql); $this->db->query($sql); $place = $this->place; while ($this->db->next_record()) { $this->keywords_old[$this->db->f('keyword')] = $this->db->f($place); } } /** * remove special characters from index term * @param $key Keyword * @return $key */ function removeSpecialChars($key) { $aSpecialChars = array( "-", "_", "'", ".", "!", "\"", "#", "$", "%", "&", "(", ")", "*", "+", ",", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "`", "{", "|", "}", "~" ); for ($i = 127; $i < 192; $i++) { array_push($aSpecialChars, chr($i)); // some other special characters } // TODO: The transformation of accented characters must depend on the selected encoding of the language of // a client and should not be treated in this method. // modified 2007-10-01, H. Librenz - added as hotfix for encoding problems (doesn't find any words with // umlaut vowels in it since you turn on UTF-8 as language encoding) $sEncoding = getEncodingByLanguage($this->db, $this->lang, $this->cfg); if (strtolower($sEncoding) != 'iso-8859-2') { $key = clHtmlEntities($key, NULL, $sEncoding); } else { $key = htmlentities_iso88592($key); } $aUmlautMap = array( 'Ü' => 'ue', 'ü' => 'ue', 'Ä' => 'ae', 'ä' => 'ae', 'Ö' => 'oe', 'ö' => 'oe', 'ß' => 'ss' ); foreach ($aUmlautMap as $sUmlaut => $sMapped) { $key = str_replace($sUmlaut, $sMapped, $key); } $key = clHtmlEntityDecode($key); $key = str_replace($aSpecialChars, '', $key); return $key; } /** * @modified 2008-04-17, Timo Trautmann - reverse function to removeSpecialChars * (important for syntaxhighlighting searchterm in searchresults) * adds umlauts to search term * @param $key Keyword * @return $key */ function addSpecialUmlauts($key) { $key = clHtmlEntities($key, null, getEncodingByLanguage($this->db, $this->lang, $this->cfg)); $aUmlautMap = array( 'ue' => 'Ü', 'ue' => 'ü', 'ae' => 'Ä', 'ae' => 'ä', 'oe' => 'Ö', 'oe' => 'ö', 'ss' => 'ß' ); foreach ($aUmlautMap as $sUmlaut => $sMapped) { $key = str_replace($sUmlaut, $sMapped, $key); } $key = clHtmlEntityDecode($key); return $key; } /** * set the array of stopwords which should not be indexed * @param array $aStopwords * @return void */ function setStopwords($aStopwords) { if (is_array($aStopwords) && count($aStopwords) > 0) { $this->stopwords = $aStopwords; } } /** * set the cms types * @param none * @return void */ function setContentTypes() { $sql = "SELECT type, idtype FROM " . $this->cfg['tab']['type'] . ' '; $this->_debug('sql', $sql); $this->db->query($sql); while ($this->db->next_record()) { $this->cms_type[$this->db->f('type')] = $this->db->f('idtype'); $this->cms_type_suffix[$this->db->f('idtype')] = substr($this->db->f('type'), 4, strlen($this->db->f('type'))); } } /** * set the cms_options array of cms types which should be treated special * @param none * @return void */ function setCmsOptions($cms_options) { if (is_array($cms_options) && count($cms_options) > 0) { foreach ($cms_options as $opt) { $opt = strtoupper($opt); if (strlen($opt) > 0) { if (!stristr($opt, 'cms_')) { if (in_array($opt, $this->cms_type_suffix)) { $this->cms_options[$opt] = 'CMS_' . $opt; } } else { if (array_key_exists($opt, $this->cms_type)) { $this->cms_options[$opt] = $opt; } } } } } else { $this->cms_options = array(); } } /** * check if the current cms type is in the cms_options array * @param $idtype * * @return bolean */ function checkCmsType($idtype) { $idtype = strtoupper($idtype); return (in_array($idtype, $this->cms_options)) ? false : true; } } /** * Contenido API - Search Object * * This object starts a indexed fulltext search * * TODO: * The way to set the search options could be done much more better! * The computation of the set of searchable articles should not be treated in this class. * It is better to compute the array of searchable articles from the outside and to pass the array of searchable articles as parameter. * Avoid foreach loops. * * Use object with * * $options = array('db' => 'regexp', // use db function regexp * 'combine' => 'or'); // combine searchwords with or * * The range of searchable articles is by default the complete content which is online and not protected. * * With option 'searchable_articles' you can define your own set of searchable articles. * If parameter 'searchable_articles' is set the options 'cat_tree', 'categories', 'articles', 'exclude', 'artspecs', * 'protected', 'dontshowofflinearticles' don't have any effect. * * $options = array('db' => 'regexp', // use db function regexp * 'combine' => 'or', // combine searchwords with or * 'searchable_articles' => array(5, 6, 9, 13)); * * One can define the range of searchable articles by setting the parameter 'exclude' to false which means the range of categories * defined by parameter 'cat_tree' or 'categories' and the range of articles defined by parameter 'articles' is included. * * $options = array('db' => 'regexp', // use db function regexp * 'combine' => 'or', // combine searchwords with or * 'exclude' => false, // => searchrange specified in 'cat_tree', 'categories' and 'articles' is included * 'cat_tree' => array(12), // tree with root 12 included * 'categories' => array(100,111), // categories 100, 111 included * 'articles' => array(33), // article 33 included * 'artspecs' => array(2, 3), // array of article specifications => search only articles with these artspecs * 'res_per_page' => 2, // results per page * 'protected' => true); // => do not search articles or articles in categories which are offline or protected * 'dontshowofflinearticles' => false); // => search offline articles or articles in categories which are offline * * You can build the complement of the range of searchable articles by setting the parameter 'exclude' to true which means the range of categories * defined by parameter 'cat_tree' or 'categories' and the range of articles defined by parameter 'articles' is excluded from search. * * $options = array('db' => 'regexp', // use db function regexp * 'combine' => 'or', // combine searchwords with or * 'exclude' => true, // => searchrange specified in 'cat_tree', 'categories' and 'articles' is excluded * 'cat_tree' => array(12), // tree with root 12 excluded * 'categories' => array(100,111), // categories 100, 111 excluded * 'articles' => array(33), // article 33 excluded * 'artspecs' => array(2, 3), // array of article specifications => search only articles with these artspecs * 'res_per_page' => 2, // results per page * 'protected' => true); // => do not search articles or articles in categories which are offline or protected * 'dontshowofflinearticles' => false); // => search offline articles or articles in categories which are offline * * $search = new Search($options); * * $cms_options = array("htmlhead", "html", "head", "text", "imgdescr", "link", "linkdescr"); * search only in these cms-types * $search->setCmsOptions($cms_options); * * $search_result = $search->searchIndex($searchword, $searchwordex); // start search * * The search result structure has following form * Array ( * [20] => Array ( * [CMS_HTML] => Array ( * [0] => 1 * [1] => 1 * [2] => 1 * ) * [keyword] => Array ( * [0] => content * [1] => contenido * [2] => wwwcontenidoorg * ) * [search] => Array ( * [0] => con * [1] => con * [2] => con * ) * [occurence] => Array ( * [0] => 1 * [1] => 5 * [2] => 1 * ) * [similarity] => 60 * ) * ) * * The keys of the array are the article ID's found by search. * * Searching 'con' matches keywords 'content', 'contenido' and 'wwwcontenidoorg' in article with ID 20 in content type CMS_HTML[1]. * The search term occurs 7 times. * The maximum similarity between searchterm and matching keyword is 60%. * * with $oSearchResults = new SearchResult($search_result, 10); * one can rank and display the results * * @version 1.0.1 * * @author Willi Man * @copyright four for business AG */ class Search extends SearchBaseAbstract { /** * Instance of class Index * @var object */ var $index; /** * array of available cms types * @var array */ var $cms_type = array(); /** * suffix of available cms types * @var array */ var $cms_type_suffix = array(); /** * the search words * @var array */ var $search_words = array(); /** * the words which should be excluded from search * @var array */ var $search_words_exclude = array(); /** * type of db search * like => 'sql like', regexp => 'sql regexp' * @var string */ var $search_option; /** * logical combination of searchwords (and, or) * @var string */ var $search_combination; /** * array of searchable articles * @var array */ var $searchable_arts = array(); /** * article specifications * @var array */ var $article_specs = array(); /** * If $protected = true => do not search articles which are offline or articles in catgeories which are offline (protected) * @var boolean */ var $protected; /** * If $dontshowofflinearticles = false => search offline articles or articles in categories which are offline * @var boolean */ var $dontshowofflinearticles; /** * If $exclude = true => the specified search range is excluded from search, otherwise included * @var boolean */ var $exclude; /** * Array of article id's with information about cms-types, occurence of keyword/searchword, similarity ... * @var array */ var $search_result = array(); /** * Constructor * * @param array $options * $options['db'] 'regexp' => DB search with REGEXP; 'like' => DB search with LIKE; 'exact' => exact match; * $options['combine'] 'and', 'or' Combination of search words with AND, OR * $options['exclude'] 'true' => searchrange specified in 'cat_tree', 'categories' and 'articles' is excluded; 'false' => searchrange specified in 'cat_tree', 'categories' and 'articles' is included * $options['cat_tree'] e.g. array(8) => The complete tree with root 8 is in/excluded from search * $options['categories'] e.g. array(10, 12) => Categories 10, 12 in/excluded * $options['articles'] e.g. array(23) => Article 33 in/excluded * $options['artspecs'] => e.g. array(2, 3) => search only articles with certain article specifications * $options['protected'] 'true' => do not search articles which are offline (locked) or articles in catgeories which are offline (protected) * $options['dontshowofflinearticles'] 'false' => search offline articles or articles in categories which are offline * $options['searchable_articles'] array of article ID's which should be searchable * @param DB_ConLite $oDB Optional database instance * @return void */ function __construct($options, $oDB = null) { parent::__construct($oDB); $this->index = new Index($oDB); $this->cms_type = $this->index->cms_type; $this->cms_type_suffix = $this->index->cms_type_suffix; $this->search_option = (array_key_exists('db', $options)) ? strtolower($options['db']) : 'regexp'; $this->search_combination = (array_key_exists('combine', $options)) ? strtolower($options['combine']) : 'or'; $this->protected = (array_key_exists('protected', $options)) ? $options['protected'] : true; $this->dontshowofflinearticles = (array_key_exists('dontshowofflinearticles', $options)) ? $options['dontshowofflinearticles'] : false; $this->exclude = (array_key_exists('exclude', $options)) ? $options['exclude'] : true; $this->article_specs = (array_key_exists('artspecs', $options) && is_array($options['artspecs'])) ? $options['artspecs'] : array(); $this->index->setCmsOptions($this->cms_type_suffix); if (array_key_exists('searchable_articles', $options) && is_array($options['searchable_articles'])) { $this->searchable_arts = $options['searchable_articles']; } else { $this->searchable_arts = $this->getSearchableArticles($options); } $this->intMinimumSimilarity = 50; # minimum similarity between searchword and keyword in percent } /** * indexed fulltext search * @param string $searchwords The search words * @param string $searchwords_exclude The words, which should be excluded from search * @return void */ function searchIndex($searchwords, $searchwords_exclude = '') { if (strlen(trim($searchwords)) > 0) { $this->search_words = $this->stripWords($searchwords); } else { return false; } if (strlen(trim($searchwords_exclude)) > 0) { $this->search_words_exclude = $this->stripWords($searchwords_exclude); } $tmp_searchwords = array(); foreach ($this->search_words as $word) { if ($this->search_option == 'like') { $word = "'%" . $word . "%'"; } elseif ($this->search_option == 'exact') { $word = "'" . $word . "'"; } array_push($tmp_searchwords, $word); } if (count($this->search_words_exclude) > 0) { foreach ($this->search_words_exclude as $word) { if ($this->search_option == 'like') { $word = "'%" . $word . "%'"; } elseif ($this->search_option == 'exact') { $word = "'" . $word . "'"; } array_push($tmp_searchwords, $word); array_push($this->search_words, $word); } } if (count($tmp_searchwords) == 0) return false; if ($this->search_option == 'regexp') { // regexp search $kwSql = "keyword REGEXP '" . implode('|', $tmp_searchwords) . "'"; } elseif ($this->search_option == 'like') { // like search $search_like = implode(" OR keyword LIKE ", Contenido_Security::escapeDB($tmp_searchwords, $this->db)); $kwSql = "keyword LIKE '" . $search_like; } elseif ($this->search_option == 'exact') { // exact match $search_exact = implode(" OR keyword = ", Contenido_Security::escapeDB($tmp_searchwords, $this->db)); $kwSql = "keyword LIKE '" . $search_exact; } $sql = "SELECT keyword, auto FROM " . $this->cfg['tab']['keywords'] . " WHERE idlang=" . Contenido_Security::toInteger($this->lang) . " AND " . $kwSql . " "; $this->_debug('sql', $sql); $this->db->query($sql); while ($this->db->next_record()) { $tmp_index_string = preg_split('/&/', $this->db->f('auto'), -1, PREG_SPLIT_NO_EMPTY); $this->_debug('index', $this->db->f('auto')); $tmp_index = array(); foreach ($tmp_index_string as $string) { $tmp_string = preg_replace('/[=\(\)]/', ' ', $string); $tmp_index[] = preg_split('/\s/', $tmp_string, -1, PREG_SPLIT_NO_EMPTY); } $this->_debug('tmp_index', $tmp_index); foreach ($tmp_index as $string) { $artid = $string[0]; // filter nonsearchable articles if (in_array($artid, $this->searchable_arts)) { $cms_place = $string[2]; $keyword = $this->db->f('keyword'); $percent = 0; $similarity = 0; foreach ($this->search_words as $word) { similar_text($word, $keyword, $percent); // computes similarity between searchword and keyword in percent if ($percent > $similarity) { $similarity = $percent; $searchword = $word; } } $tmp_cmstype = preg_split('/[,]/', $cms_place, -1, PREG_SPLIT_NO_EMPTY); $this->_debug('tmp_cmstype', $tmp_cmstype); $tmp_cmstype2 = array(); foreach ($tmp_cmstype as $type) { $tmp_cmstype2[] = preg_split('/-/', $type, -1, PREG_SPLIT_NO_EMPTY); } $this->_debug('tmp_cmstype2', $tmp_cmstype2); foreach ($tmp_cmstype2 as $type) { if (!$this->index->checkCmsType($type[0])) { // search for specified cms-types if ($similarity >= $this->intMinimumSimilarity) { // include article into searchresult set only if // similarity between searchword and keyword is big enough $this->search_result[$artid][$type[0]][] = $type[1]; $this->search_result[$artid]['keyword'][] = $this->db->f('keyword'); $this->search_result[$artid]['search'][] = $searchword; $this->search_result[$artid]['occurence'][] = $string[1]; $this->search_result[$artid]['debug_similarity'][] = $percent; if ($similarity > $this->search_result[$artid]['similarity']) { $this->search_result[$artid]['similarity'] = $similarity; } } } } } } } if ($this->search_combination == 'and') { // all search words must appear in the article foreach ($this->search_result as $article => $val) { if (!count(array_diff($this->search_words, $val['search'])) == 0) { //$this->rank_structure[$article] = $rank[$article]; unset($this->search_result[$article]); } } } if (count($this->search_words_exclude) > 0) { // search words to be excluded must not appear in article foreach ($this->search_result as $article => $val) { if (!count(array_intersect($this->search_words_exclude, $val['search'])) == 0) { //$this->rank_structure[$article] = $rank[$article]; unset($this->search_result[$article]); } } } $this->_debug('$this->search_result', $this->search_result); $this->_debug('$this->searchable_arts', $this->searchable_arts); return $this->search_result; } /** * @param $cms_options The cms-types (htmlhead, html, ...) which should explicitly be searched * @return void */ function setCmsOptions($cms_options) { if (is_array($cms_options) && count($cms_options) > 0) { $this->index->setCmsOptions($cms_options); } } /** * @param $searchwords The search-words * @return Array of stripped search-words */ function stripWords($searchwords) { $tmp_words = array(); $searchwords = stripslashes($searchwords); // remove backslash $searchwords = strip_tags($searchwords); // remove html tags $tmp_words = preg_split('/[\s,]+/', trim($searchwords)); // split the phrase by any number of commas or space characters $tmp_searchwords = array(); foreach ($tmp_words as $word) { $word = strtolower($word); $word = $this->index->removeSpecialChars(trim($word)); if (strlen($word) > 1) { array_push($tmp_searchwords, $word); } } return array_unique($tmp_searchwords); } /** * Returns the category tree array. * * @param int $cat_start Root of a category tree * @return array Category Tree * @todo This is not the job for search, should be oursourced... */ function getSubTree($cat_start) { $sql = "SELECT B.idcat, B.parentid FROM " . $this->cfg['tab']['cat_tree'] . " AS A, " . $this->cfg['tab']['cat'] . " AS B, " . $this->cfg['tab']['cat_lang'] . " AS C WHERE A.idcat = B.idcat AND B.idcat = C.idcat AND C.idlang = '" . Contenido_Security::toInteger($this->lang) . "' AND B.idclient = '" . Contenido_Security::toInteger($this->client) . "' ORDER BY idtree"; $this->_debug('sql', $sql); $this->db->query($sql); $aSubCats = array(); $i = false; while ($this->db->next_record()) { if ($this->db->f('parentid') < $cat_start) { // ending part of tree $i = false; } if ($this->db->f('idcat') == $cat_start) { // starting part of tree $i = true; } if ($i == true) { $aSubCats[] = $this->db->f('idcat'); } } return $aSubCats; } /** * Returns list of searchable article ids. * * @param array $search_range * @return array Articles in specified search range */ function getSearchableArticles($search_range) { $cat_range = array(); if (array_key_exists('cat_tree', $search_range) && is_array($search_range['cat_tree'])) { if (count($search_range['cat_tree']) > 0) { foreach ($search_range['cat_tree'] as $cat) { $cat_range = array_merge($cat_range, $this->getSubTree($cat)); } } } if (array_key_exists('categories', $search_range) && is_array($search_range['categories'])) { if (count($search_range['categories']) > 0) { $cat_range = array_merge($cat_range, $search_range['categories']); } } $cat_range = array_unique($cat_range); $sCatRange = implode("','", $cat_range); if (array_key_exists('articles', $search_range) && is_array($search_range['articles'])) { if (count($search_range['articles']) > 0) { $sArtRange = implode("','", $search_range['articles']); } else { $sArtRange = ''; } } $id_arts = array(); if ($this->protected == true) { $protected = " C.public = '1' AND C.visible = '1' AND B.online = '1' "; } else { if ($this->dontshowofflinearticles == true) { $protected = " C.visible = '1' AND B.online = '1' "; } else { $protected = " 1 "; } } if ($this->exclude == true) { // exclude searchrange $sSearchRange = " A.idcat NOT IN ('" . $sCatRange . "') AND B.idart NOT IN ('" . $sArtRange . "') AND "; } else { // include searchrange if (strlen($sArtRange) > 0) { $sSearchRange = " A.idcat IN ('" . $sCatRange . "') AND B.idart IN ('" . $sArtRange . "') AND "; } else { $sSearchRange = " A.idcat IN ('" . $sCatRange . "') AND "; } } if (count($this->article_specs) > 0) { $sArtSpecs = " B.artspec IN ('" . implode("','", $this->article_specs) . "') AND "; } else { $sArtSpecs = ''; } $sql = "SELECT A.idart FROM " . $this->cfg["tab"]["cat_art"] . " as A, " . $this->cfg["tab"]["art_lang"] . " as B, " . $this->cfg["tab"]["cat_lang"] . " as C WHERE " . $sSearchRange . " B.idlang = '" . Contenido_Security::toInteger($this->lang) . "' AND C.idlang = '" . Contenido_Security::toInteger($this->lang) . "' AND A.idart = B.idart AND A.idcat = C.idcat AND " . $sArtSpecs . " " . $protected . " "; $this->_debug('sql', $sql); $this->db->query($sql); while ($this->db->next_record()) { $id_arts[] = $this->db->f('idart'); } return $id_arts; } /** * Fetch all article specifications which are online, * * @return array Array of article specification Ids */ function getArticleSpecifications() { $sql = "SELECT idartspec FROM " . $this->cfg['tab']['art_spec'] . " WHERE client = " . Contenido_Security::toInteger($this->client) . " AND lang = " . Contenido_Security::toInteger($this->lang) . " AND online = 1 "; $this->_debug('sql', $sql); $this->db->query($sql); $aArtspec = array(); while ($this->db->next_record()) { $aArtspec[] = $this->db->f('idartspec'); } return $aArtspec; } /** * Set article specification * @param int $iArtspecID * @return void */ function setArticleSpecification($iArtspecID) { array_push($this->article_specs, $iArtspecID); } /** * Add all article specifications matching name of article specification (client dependent but language independent) * @param string $sArtSpecName * @return void */ function addArticleSpecificationsByName($sArtSpecName) { if (!isset($sArtSpecName) || strlen($sArtSpecName) == 0) { return false; } $sql = "SELECT idartspec FROM " . $this->cfg['tab']['art_spec'] . " WHERE client = " . Contenido_Security::toInteger($this->client) . " AND artspec = '" . Contenido_Security::escapeDB($sArtSpecName, $this->db) . "' "; $this->_debug('sql', $sql); $this->db->query($sql); while ($this->db->next_record()) { array_push($this->article_specs, $this->db->f('idartspec')); } } } /** * Contenido API - SearchResult Object * * This object ranks and displays the result of the indexed fulltext search. * If you are not comfortable with this API feel free to use your own methods to display the search results. * The search result is basically an array with article ID's. * * If $search_result = $search->searchIndex($searchword, $searchwordex); * * use object with * * $oSearchResults = new SearchResult($search_result, 10); * * $oSearchResults->setReplacement('', ''); // html-tags to emphasize the located searchwords * * $num_res = $oSearchResults->getNumberOfResults(); * $num_pages = $oSearchResults->getNumberOfPages(); * $res_page = $oSearchResults->getSearchResultPage(1); // first result page * foreach ($res_page as $key => $val) { * $headline = $oSearchResults->getSearchContent($key, 'HTMLHEAD'); * $first_headline = $headline[0]; * $text = $oSearchResults->getSearchContent($key, 'HTML'); * $first_text = $text[0]; * $similarity = $oSearchResults->getSimilarity($key); * $iOccurrence = $oSearchResults->getOccurrence($key); * } * * @version 1.0.0 * * @author Willi Man * @copyright four for business AG * */ class SearchResult extends SearchBaseAbstract { /** * Instance of class Index * @var object */ var $index; /** * Number of results * @var int */ var $results; /** * Number of result pages * @var int */ var $pages; /** * Current result page * @var int */ var $result_page; /** * Results per page to display * @var int */ var $result_per_page; /** * Array of html-tags to emphasize the searchwords * @var array */ var $replacement = array(); /** * Array of article id's with ranking information * @var array */ var $rank_structure = array(); /** * Array of result-pages with array's of article id's * @var array */ var $ordered_search_result = array(); /** * Array of article id's with information about cms-types, occurence of keyword/searchword, similarity ... * @var array */ var $search_result = array(); /** * Compute ranking factor for each search result and order the search results by ranking factor * NOTE: The ranking factor is the sum of occurences of matching searchterms weighted by similarity (in %) between searchword * and matching word in the article. * TODO: One can think of more sophisticated ranking strategies. One could use the content type information for example * because a matching word in the headline (CMS_HEADLINE[1]) could be weighted more than a matching word in the text (CMS_HTML[1]). * * @param array $search_result List of article ids * @param int $result_per_page Number of items per page * @param DB_ConLite $oDB Optional db instance * @param bool $bDebug Optional flag to enable debugging */ function __construct($search_result, $result_per_page, $oDB = null, $bDebug = false) { parent::__construct($oDB, $bDebug); $this->index = new Index($oDB); $this->search_result = $search_result; $this->_debug('$this->search_result', $this->search_result); $this->result_per_page = $result_per_page; $this->results = count($this->search_result); # compute ranking factor for each search result foreach ($this->search_result as $article => $val) { $this->rank_structure[$article] = $this->getOccurrence($article) * ( $this->getSimilarity($article) / 100); } $this->_debug('$this->rank_structure', $this->rank_structure); $this->setOrderedSearchResult($this->rank_structure, $this->result_per_page); $this->pages = count($this->ordered_search_result); $this->_debug('$this->ordered_search_result', $this->ordered_search_result); } /** * @param $ranked_search * @param $result_per_page * @return void */ function setOrderedSearchResult($ranked_search, $result_per_page) { asort($ranked_search); $sorted_rank = array_reverse($ranked_search, true); if (isset($result_per_page) && $result_per_page > 0) { $split_result = array(); $split_result = array_chunk($sorted_rank, $result_per_page, true); $this->ordered_search_result = $split_result; } else { $this->ordered_search_result[] = $sorted_rank; } } /** * @param $cms_type * @param $art_id Id of an article * @return Content of an article, specified by it's content type */ function getContent($art_id, $cms_type, $id = 0) { $article = new Article($art_id, $this->client, $this->lang); return $article->getContent($cms_type, $id); } /** * @param $cms_type Content type * @param $art_id Id of an article * @return Content of an article in search result, specified by its type */ function getSearchContent($art_id, $cms_type, $cms_nr = NULL) { $cms_type = strtoupper($cms_type); if (strlen($cms_type) > 0) { if (!stristr($cms_type, 'cms_')) { if (in_array($cms_type, $this->index->cms_type_suffix)) { $cms_type = 'CMS_' . $cms_type; } } else { if (!array_key_exists($cms_type, $this->index->cms_type)) { return array(); } } } $article = new Article($art_id, $this->client, $this->lang); $content = array(); if (isset($this->search_result[$art_id][$cms_type])) { // if searchword occurs in cms_type $search_words = $this->search_result[$art_id]['search']; $search_words = array_unique($search_words); $id_type = $this->search_result[$art_id][$cms_type]; $id_type = array_unique($id_type); if (isset($cms_nr) && is_numeric($cms_nr)) { // get content of cms_type[cms_nr] //build consistent escaped string(Timo Trautmann) 2008-04-17 $cms_content = clHtmlEntities(clHtmlEntityDecode(strip_tags($article->getContent($cms_type, $cms_nr)))); if (count($this->replacement) == 2) { foreach ($search_words as $word) { //build consistent escaped string, replace ae ue .. with original html entities (Timo Trautmann) 2008-04-17 $word = clHtmlEntities(clHtmlEntityDecode($this->index->addSpecialUmlauts($word))); $match = array(); preg_match("/$word/i", $cms_content, $match); if (isset($match[0])) { $pattern = $match[0]; $replacement = $this->replacement[0] . $pattern . $this->replacement[1]; $cms_content = preg_replace("/$pattern/i", $replacement, $cms_content); // emphasize located searchwords } } } $content[] = htmlspecialchars_decode($cms_content); } else { // get content of cms_type[$id], where $id are the cms_type numbers found in search foreach ($id_type as $id) { $cms_content = strip_tags($article->getContent($cms_type, $id)); if (count($this->replacement) == 2) { foreach ($search_words as $word) { preg_match("/$word/i", $cms_content, $match); if (isset($match[0])) { $pattern = $match[0]; $replacement = $this->replacement[0] . $pattern . $this->replacement[1]; $cms_content = preg_replace("/$pattern/i", $replacement, $cms_content); // emphasize located searchwords } } } $content[] = $cms_content; } } } else { // searchword was not found in cms_type if (isset($cms_nr) && is_numeric($cms_nr)) { $content[] = strip_tags($article->getContent($cms_type, $cms_nr)); } else { $art_content = $article->getContent($cms_type); if (count($art_content) > 0) { foreach ($art_content as $val) { $content[] = strip_tags($val); } } } } return $content; } /** * Returns articles in page. * * @param int $page_id * @return array Artices in page $page_id */ function getSearchResultPage($page_id) { $this->result_page = $page_id; $result_page = $this->ordered_search_result[$page_id - 1]; return $result_page; } /** * Returns number of result pages * @return int */ function getNumberOfPages() { return $this->pages; } /** * Returns number of results * @return int */ function getNumberOfResults() { return $this->results; } /** * @param $art_id Id of an article * @return Similarity between searchword and matching word in article */ function getSimilarity($art_id) { return $this->search_result[$art_id]['similarity']; } /** * @param $art_id Id of an article * @return Number of matching searchwords found in article */ function getOccurrence($art_id) { $aOccurence = $this->search_result[$art_id]['occurence']; $iSumOfOccurence = 0; for ($i = 0; $i < count($aOccurence); $i++) { $iSumOfOccurence += $aOccurence[$i]; } return $iSumOfOccurence; } /** * @param string $rep1 The opening html-tag to emphasize the searchword e.g. '' * @param string $rep2 The closing html-tag e.g. '' * @return void */ function setReplacement($rep1, $rep2) { if (strlen(trim($rep1)) > 0 && strlen(trim($rep2)) > 0) { array_push($this->replacement, $rep1); array_push($this->replacement, $rep2); } } /** * @param $artid * @return Category Id * @todo Is not job of search, should be outsourced! */ function getArtCat($artid) { $sql = "SELECT idcat FROM " . $this->cfg['tab']['cat_art'] . " WHERE idart = " . Contenido_Security::toInteger($artid) . " "; $this->db->query($sql); if ($this->db->next_record()) { return $this->db->f('idcat'); } } }