From 6729446154807170ab00fa3309652fa2911ce15f Mon Sep 17 00:00:00 2001 From: "o.pinke" Date: Wed, 22 Feb 2023 15:34:23 +0100 Subject: [PATCH] bugfixes search and keywords generation --- conlite/classes/class.article.collector.php | 8 +- conlite/classes/class.search.php | 86 ++++++++++++++----- .../contenido/class.articlelanguage.php | 42 ++++----- conlite/includes/functions.con2.php | 62 +++++-------- conlite/includes/functions.general.php | 1 - 5 files changed, 110 insertions(+), 89 deletions(-) diff --git a/conlite/classes/class.article.collector.php b/conlite/classes/class.article.collector.php index 0d9a25e..26a3bdb 100644 --- a/conlite/classes/class.article.collector.php +++ b/conlite/classes/class.article.collector.php @@ -34,7 +34,7 @@ class cArticleCollector implements SeekableIterator, Countable { protected $_aStartArticles = array(); protected $_aOptions = array(); protected $_aOptionsDefault = array(); - private $_bAsObject = TRUE; + private $_bAsObject = true; /** * @@ -65,14 +65,12 @@ class cArticleCollector implements SeekableIterator, Countable { } if (count($this->_aStartArticles) > 0) { - print_r($this->_aStartArticles); if ($this->_aOptions['start'] == false) { $oArtLangColl->setWhere("cApiArticleLanguageCollection.idartlang", $this->_aStartArticles, "NOTIN"); //$sqlStartArticles = "a.idartlang NOT IN ('" . implode("','", $this->_startArticles) . "') AND "; } if ($this->_aOptions['startonly'] == true) { - echo "startonly"; $oArtLangColl->setWhere("cApiArticleLanguageCollection.idartlang", $this->_aStartArticles, "IN"); //$sqlStartArticles = "a.idartlang IN ('" . implode("','", $this->_startArticles) . "') AND "; } @@ -89,7 +87,6 @@ class cArticleCollector implements SeekableIterator, Countable { $oArtLangColl->setWhere("cApiArticleLanguageCollection.idlang", $this->_aOptions['lang']); $oArtLangColl->query(); - echo $oArtLangColl->_lastSQL; if ($oArtLangColl->count() > 0) { $aTable = $oArtLangColl->fetchTable(); //echo $oArtLangColl->_lastSQL; @@ -97,7 +94,6 @@ class cArticleCollector implements SeekableIterator, Countable { foreach ($aTable as $aItem) { $this->_aArticles[] = $aItem['idartlang']; } - print_r($this->_aArticles); } } @@ -162,7 +158,7 @@ class cArticleCollector implements SeekableIterator, Countable { * * @return cApiArticleLanguage|int returns article language object or idartlang */ - public function current() { + public function current() :cApiArticleLanguage|int{ $iIdartlang = $this->_aArticles[$this->_iCurrentPosition]; if ($this->_bAsObject) { $oArticle = new cApiArticleLanguage($iIdartlang); diff --git a/conlite/classes/class.search.php b/conlite/classes/class.search.php index b2f6599..5d314b5 100644 --- a/conlite/classes/class.search.php +++ b/conlite/classes/class.search.php @@ -225,13 +225,13 @@ class Index extends SearchBaseAbstract { * * @var array */ - var $cms_type = array(); + protected static $_cms_type = []; /** * the suffix of all available cms types * @var array */ - var $cms_type_suffix = array(); + protected static $_cms_type_suffix = []; /** * Constructor, set object properties @@ -270,6 +270,8 @@ class Index extends SearchBaseAbstract { $this->idart = $idart; } + $this->_debug('Start Index for ', $this->idart); + $this->place = $place; $this->keycode = $aContent; $this->setStopwords($aStopwords); @@ -283,7 +285,14 @@ class Index extends SearchBaseAbstract { $old_keys = array_keys($this->keywords_old); $this->keywords_del = array_diff($old_keys, $new_keys); - + /* + echo '
';
+          print_r($new_keys);
+          print_r($old_keys);
+          print_r($this->keywords_del);
+          echo '
'; + * + */ if (count($this->keywords_del) > 0) { $this->deleteKeywords(); } @@ -312,7 +321,7 @@ class Index extends SearchBaseAbstract { foreach ($this->keycode as $idtype => $data) { if ($this->checkCmsType($idtype)) { foreach ($data as $typeid => $code) { - $this->_debug('code', $code); + $this->_debug('createKeywords: raw code from data array', $code); $code = stripslashes($code); // remove backslash $code = str_ireplace(array('
', '
'), "\n", $code); // replace HTML line breaks with newlines @@ -320,13 +329,18 @@ class Index extends SearchBaseAbstract { if (strlen($code) > 0) { $code = clHtmlEntityDecode($code); } - $this->_debug('code', $code); + $this->_debug('createKeywords: code after clean', $code); $tmp_keys = preg_split('/[\s,]+/', trim($code)); // split content by any number of commas or space characters - $this->_debug('tmp_keys', $tmp_keys); + $this->_debug('createKeywords: tmp_keys', $tmp_keys); foreach ($tmp_keys as $value) { $value = strtolower($value); // index terms are stored with lower case + $value = preg_replace('/[^\w]+/u', '', $value); + + if (empty(trim($value))) { + continue; + } if (!in_array($value, $this->stopwords)) { // eliminate stopwords @@ -335,6 +349,7 @@ class Index extends SearchBaseAbstract { if (strlen($value) > 1) { // do not index single characters $this->keywords[$value] = $this->keywords[$value] . $idtype . '-' . $typeid . ' '; + $this->_debug('createKeywords: entry array keywords', $this->keywords); } } } @@ -345,7 +360,7 @@ class Index extends SearchBaseAbstract { } } - $this->_debug('keywords', $this->keywords); + $this->_debug('createKeywords: keywords returned', $this->keywords); } /** @@ -357,9 +372,10 @@ class Index extends SearchBaseAbstract { $tmp_count = array(); foreach ($this->keywords as $keyword => $count) { + $bProceed = true; + $this->_debug('keyword', $keyword); $tmp_count = preg_split('/[\s]/', trim($count)); $this->_debug('tmp_count', $tmp_count); - $occurrence = count($tmp_count); $tmp_count = array_unique($tmp_count); $cms_types = implode(',', $tmp_count); @@ -376,8 +392,12 @@ class Index extends SearchBaseAbstract { ('" . Contenido_Security::escapeDB($keyword, $this->db) . "', '" . Contenido_Security::escapeDB($index_string, $this->db) . "', " . Contenido_Security::toInteger($this->lang) . ", " . Contenido_Security::toInteger($nextid) . ")"; } else { // if keyword allready exists, create new index_string - if (preg_match("/&$this->idart=/", $this->keywords_old[$keyword])) { - $index_string = preg_replace("/&$this->idart=[0-9]+\([\w-,]+\)/", $index_string, $this->keywords_old[$keyword]); + if (preg_match("/&" . $this->idart . "=/", $this->keywords_old[$keyword])) { + $index_string = preg_replace("/&" . $this->idart . "=[0-9]+\([,\w-]+\)/", $index_string, $this->keywords_old[$keyword]); + if ($index_string === $this->keywords_old[$keyword]) { + $bProceed = false; + $this->_debug('db update', 'no update needed'); + } } else { $index_string = $this->keywords_old[$keyword] . $index_string; } @@ -386,9 +406,11 @@ class Index extends SearchBaseAbstract { SET " . $this->place . " = '" . $index_string . "' WHERE idlang='" . Contenido_Security::toInteger($this->lang) . "' AND keyword='" . Contenido_Security::escapeDB($keyword, $this->db) . "'"; } - $this->_debug('sql', $sql); - $this->db->query($sql); + if ($bProceed) { + $this->_debug('sql', $sql); + $this->db->query($sql); + } } } @@ -431,7 +453,7 @@ class Index extends SearchBaseAbstract { idlang=" . Contenido_Security::toInteger($this->lang) . " AND (keyword IN ('" . $keys . "') OR " . $this->place . " REGEXP '&" . Contenido_Security::toInteger($this->idart) . "=')"; - $this->_debug('sql', $sql); + $this->_debug('getKeywords: sql', $sql); $this->db->query($sql); @@ -440,6 +462,8 @@ class Index extends SearchBaseAbstract { while ($this->db->next_record()) { $this->keywords_old[$this->db->f('keyword')] = $this->db->f($place); } + + $this->_debug('getKeywords: array keywords_old', $this->keywords_old); } /** @@ -448,6 +472,7 @@ class Index extends SearchBaseAbstract { * @return $key */ function removeSpecialChars($key) { + $aSpecialChars = array( "-", "_", "'", ".", "!", "\"", "#", "$", "%", "&", "(", ")", "*", "+", ",", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "`", "{", "|", "}", "~" @@ -461,6 +486,7 @@ class Index extends SearchBaseAbstract { // a client and should not be treated in this method. // modified 2007-10-01, H. Librenz - added as hotfix for encoding problems (doesn't find any words with // umlaut vowels in it since you turn on UTF-8 as language encoding) + $sEncoding = getEncodingByLanguage($this->db, $this->lang, $this->cfg); if (strtolower($sEncoding) != 'iso-8859-2') { @@ -486,6 +512,9 @@ class Index extends SearchBaseAbstract { $key = clHtmlEntityDecode($key); $key = str_replace($aSpecialChars, '', $key); + ini_set('mbstring.substitute_character', "none"); + $key = mb_convert_encoding($key, 'UTF-8', 'UTF-8'); + return $key; } @@ -516,6 +545,21 @@ class Index extends SearchBaseAbstract { return $key; } + /** + * + * @return array array with arrays of type and typesuffix + */ + public function getContentTypes(): array { + if (empty(self::$_cms_type)) { + $this->setContentTypes(); + } + + return array( + 'cms_type' => self::$_cms_type, + 'cms_type_suffix' => self::$_cms_type_suffix + ); + } + /** * set the array of stopwords which should not be indexed * @param array $aStopwords @@ -537,8 +581,8 @@ class Index extends SearchBaseAbstract { $this->_debug('sql', $sql); $this->db->query($sql); while ($this->db->next_record()) { - $this->cms_type[$this->db->f('type')] = $this->db->f('idtype'); - $this->cms_type_suffix[$this->db->f('idtype')] = substr($this->db->f('type'), 4, strlen($this->db->f('type'))); + self::$_cms_type[$this->db->f('type')] = $this->db->f('idtype'); + self::$_cms_type_suffix[$this->db->f('idtype')] = substr($this->db->f('type'), 4, strlen($this->db->f('type'))); } } @@ -554,11 +598,11 @@ class Index extends SearchBaseAbstract { if (strlen($opt) > 0) { if (!stristr($opt, 'cms_')) { - if (in_array($opt, $this->cms_type_suffix)) { + if (in_array($opt, $this->getContentTypes()['cms_type_suffix'])) { $this->cms_options[$opt] = 'CMS_' . $opt; } } else { - if (array_key_exists($opt, $this->cms_type)) { + if (array_key_exists($opt, $this->getContentTypes()['cms_type'])) { $this->cms_options[$opt] = $opt; } } @@ -788,8 +832,8 @@ class Search extends SearchBaseAbstract { $this->index = new Index($oDB); - $this->cms_type = $this->index->cms_type; - $this->cms_type_suffix = $this->index->cms_type_suffix; + $this->cms_type = $this->index->getContentTypes()['cms_type']; + $this->cms_type_suffix = $this->index->getContentTypes()['cms_type_suffix']; $this->search_option = (array_key_exists('db', $options)) ? strtolower($options['db']) : 'regexp'; $this->search_combination = (array_key_exists('combine', $options)) ? strtolower($options['combine']) : 'or'; @@ -1339,11 +1383,11 @@ class SearchResult extends SearchBaseAbstract { $cms_type = strtoupper($cms_type); if (strlen($cms_type) > 0) { if (!stristr($cms_type, 'cms_')) { - if (in_array($cms_type, $this->index->cms_type_suffix)) { + if (in_array($cms_type, $this->index->getContentTypes()['cms_type'])) { $cms_type = 'CMS_' . $cms_type; } } else { - if (!array_key_exists($cms_type, $this->index->cms_type)) { + if (!array_key_exists($cms_type, $this->index->getContentTypes()['cms_type_suffix'])) { return array(); } } diff --git a/conlite/classes/contenido/class.articlelanguage.php b/conlite/classes/contenido/class.articlelanguage.php index 3f12816..3327008 100644 --- a/conlite/classes/contenido/class.articlelanguage.php +++ b/conlite/classes/contenido/class.articlelanguage.php @@ -1,4 +1,5 @@ select($select); } } - + public function getIdArtLang($iIdart, $iIdlang) { $this->setWhere('idart', Contenido_Security::toInteger($iIdart)); $this->setWhere('idlang', Contenido_Security::toInteger($iIdlang)); - if($this->query() && $this->count() > 0) { + if ($this->query() && $this->count() > 0) { return $this->next()->get('idartlang'); } return false; } + } +class cApiArticleLanguage extends Item { -class cApiArticleLanguage extends Item -{ /** - * Constructor Function - * @param mixed $mId Specifies the ID of item to load + * + * @global type $cfg + * @param type $mId */ - public function __construct($mId = false) - { + public function __construct($mId = false) { global $cfg; parent::__construct($cfg["tab"]["art_lang"], "idartlang"); $this->setFilters(array(), array()); @@ -63,24 +62,23 @@ class cApiArticleLanguage extends Item $this->loadByPrimaryKey($mId); } } - + public function loadByArticleAndLanguageId($idart, $idlang) { $result = true; - if (!$this->isLoaded()) { + if (!$this->isLoaded()) { $idartlang = $this->_getIdArtLang($idart, $idlang); $result = $this->loadByPrimaryKey($idartlang); } return $result; } - - + protected function _getIdArtLang($idart, $idlang) { $sql = sprintf('SELECT idartlang FROM `%s` WHERE idart = %d AND idlang = %d', cRegistry::getConfigValue('tab', 'art_lang'), $idart, $idlang); $this->db->query($sql); $this->db->next_record(); return $this->db->f('idartlang'); } - + public function getContent($type = '', $id = NULL) { if (NULL === $this->content) { $this->_loadArticleContent(); @@ -108,16 +106,16 @@ class cApiArticleLanguage extends Item // return String return (isset($this->content[$type][$id])) ? $this->content[$type][$id] : ''; } - + protected function _loadArticleContent() { if (NULL !== $this->content) { return; } - $sql = "SELECT b.type, a.typeid, a.value FROM `".cRegistry::getConfigValue('tab', 'content') - ."` AS a, `".cRegistry::getConfigValue('tab', 'type') - ."` AS b WHERE a.idartlang = ".$this->get('idartlang') - ." AND b.idtype = a.idtype ORDER BY a.idtype, a.typeid"; + $sql = "SELECT b.type, a.typeid, a.value FROM `" . cRegistry::getConfigValue('tab', 'content') + . "` AS a, `" . cRegistry::getConfigValue('tab', 'type') + . "` AS b WHERE a.idartlang = " . $this->get('idartlang') + . " AND b.idtype = a.idtype ORDER BY a.idtype, a.typeid"; $this->db->query($sql); @@ -126,5 +124,7 @@ class cApiArticleLanguage extends Item $this->content[strtolower($this->db->f('type'))][$this->db->f('typeid')] = urldecode($this->db->f('value')); } } + } + ?> \ No newline at end of file diff --git a/conlite/includes/functions.con2.php b/conlite/includes/functions.con2.php index b424656..51cf99d 100644 --- a/conlite/includes/functions.con2.php +++ b/conlite/includes/functions.con2.php @@ -619,48 +619,30 @@ function conSetMetaValue($idartlang, $idmetatype, $value) { } /** - * (re)generate keywords for all articles of a given client (with specified language) - * @param $client Client - * @param $lang Language of a client - * @return void - * - * @author Willi Man - * Created : 12.05.2004 - * Modified : 13.05.2004 - * @copyright four for business AG 2003 + * + * @param int $client + * @param int $lang */ -function conGenerateKeywords($client, $lang) { - global $cfg; - $db_art = new DB_ConLite; +function conGenerateKeywords(int $client = null, int $lang = null) { + $aOptions = []; + $aOptions['start'] = true; + $aOptions['offline'] = true; + $aOptions['client'] = $client ?? 0; + $aOptions['lang'] = $lang ?? 0; - $options = array("img", "link", "linktarget", "swf"); // cms types to be excluded from indexing - - $sql = "SELECT - a.idart, b.idartlang - FROM - " . $cfg["tab"]["art"] . " AS a, - " . $cfg["tab"]["art_lang"] . " AS b - WHERE - a.idart = b.idart AND - a.idclient = " . Contenido_Security::escapeDB($client, $db) . " AND - b.idlang = " . Contenido_Security::escapeDB($lang, $db); - - $db_art->query($sql); - - $articles = array(); - while ($db_art->next_record()) { - $articles[$db_art->f("idart")] = $db_art->f("idartlang"); - } - - if (count($articles) > 0) { - foreach ($articles as $artid => $article_lang) { - $article_content = array(); - $article_content = conGetContentFromArticle($article_lang); - - if (count($article_content) > 0) { - $art_index = new Index($db_art); - $art_index->lang = $lang; - $art_index->start($artid, $article_content, 'auto', $options); + $oArticleCollector = new cArticleCollector(); + $oArticleCollector->setOptions($aOptions); + $oArticleCollector->loadArticles(); + /* @var $oArticle cApiArticleLanguage */ + if ($oArticleCollector->count() > 0) { + foreach ($oArticleCollector as $oArticle) { + $aArticleContent = []; + $aArticleContent = $oArticle->getContent(); + if(!empty($aArticleContent)) { + /* @var $oIndex Index */ + $oIndex = new Index(); + //$oIndex->setDebug(true); + $oIndex->start($oArticle->get('idart'), $aArticleContent, 'auto', array("img", "link", "linktarget", "swf")); } } } diff --git a/conlite/includes/functions.general.php b/conlite/includes/functions.general.php index 8c190c5..f4cb963 100644 --- a/conlite/includes/functions.general.php +++ b/conlite/includes/functions.general.php @@ -2215,7 +2215,6 @@ function clHtmlEntityDecode(string $value, ?int $flags = ENT_QUOTES | ENT_SUBSTI * @return string Returns the converted string */ function clHtmlEntities(string $value,?int $flags = ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, string $encoding = 'UTF-8') { - var_dump($flags); return htmlentities($value, $flags, $encoding); }