Bug fixes for search and keyword generation

This commit is contained in:
o.pinke 2023-02-22 15:34:23 +01:00
parent 434b483fac
commit 6729446154
5 changed files with 110 additions and 89 deletions

View file

@ -34,7 +34,7 @@ class cArticleCollector implements SeekableIterator, Countable {
protected $_aStartArticles = array();
protected $_aOptions = array();
protected $_aOptionsDefault = array();
private $_bAsObject = TRUE;
private $_bAsObject = true;
/**
*
@ -65,14 +65,12 @@ class cArticleCollector implements SeekableIterator, Countable {
}
if (count($this->_aStartArticles) > 0) {
print_r($this->_aStartArticles);
if ($this->_aOptions['start'] == false) {
$oArtLangColl->setWhere("cApiArticleLanguageCollection.idartlang", $this->_aStartArticles, "NOTIN");
//$sqlStartArticles = "a.idartlang NOT IN ('" . implode("','", $this->_startArticles) . "') AND ";
}
if ($this->_aOptions['startonly'] == true) {
echo "startonly";
$oArtLangColl->setWhere("cApiArticleLanguageCollection.idartlang", $this->_aStartArticles, "IN");
//$sqlStartArticles = "a.idartlang IN ('" . implode("','", $this->_startArticles) . "') AND ";
}
@ -89,7 +87,6 @@ class cArticleCollector implements SeekableIterator, Countable {
$oArtLangColl->setWhere("cApiArticleLanguageCollection.idlang", $this->_aOptions['lang']);
$oArtLangColl->query();
echo $oArtLangColl->_lastSQL;
if ($oArtLangColl->count() > 0) {
$aTable = $oArtLangColl->fetchTable();
//echo $oArtLangColl->_lastSQL;
@ -97,7 +94,6 @@ class cArticleCollector implements SeekableIterator, Countable {
foreach ($aTable as $aItem) {
$this->_aArticles[] = $aItem['idartlang'];
}
print_r($this->_aArticles);
}
}
@ -162,7 +158,7 @@ class cArticleCollector implements SeekableIterator, Countable {
*
* @return cApiArticleLanguage|int returns article language object or idartlang
*/
public function current() {
public function current() :cApiArticleLanguage|int{
$iIdartlang = $this->_aArticles[$this->_iCurrentPosition];
if ($this->_bAsObject) {
$oArticle = new cApiArticleLanguage($iIdartlang);

View file

@ -225,13 +225,13 @@ class Index extends SearchBaseAbstract {
*
* @var array
*/
var $cms_type = array();
protected static $_cms_type = [];
/**
* the suffix of all available cms types
* @var array
*/
var $cms_type_suffix = array();
protected static $_cms_type_suffix = [];
/**
* Constructor, set object properties
@ -270,6 +270,8 @@ class Index extends SearchBaseAbstract {
$this->idart = $idart;
}
$this->_debug('Start Index for ', $this->idart);
$this->place = $place;
$this->keycode = $aContent;
$this->setStopwords($aStopwords);
@ -283,7 +285,14 @@ class Index extends SearchBaseAbstract {
$old_keys = array_keys($this->keywords_old);
$this->keywords_del = array_diff($old_keys, $new_keys);
/*
echo '<pre>';
print_r($new_keys);
print_r($old_keys);
print_r($this->keywords_del);
echo '</pre>';
*
*/
if (count($this->keywords_del) > 0) {
$this->deleteKeywords();
}
@ -312,7 +321,7 @@ class Index extends SearchBaseAbstract {
foreach ($this->keycode as $idtype => $data) {
if ($this->checkCmsType($idtype)) {
foreach ($data as $typeid => $code) {
$this->_debug('code', $code);
$this->_debug('createKeywords: raw code from data array', $code);
$code = stripslashes($code); // remove backslash
$code = str_ireplace(array('<br>', '<br />'), "\n", $code); // replace HTML line breaks with newlines
@ -320,13 +329,18 @@ class Index extends SearchBaseAbstract {
if (strlen($code) > 0) {
$code = clHtmlEntityDecode($code);
}
$this->_debug('code', $code);
$this->_debug('createKeywords: code after clean', $code);
$tmp_keys = preg_split('/[\s,]+/', trim($code)); // split content by any number of commas or space characters
$this->_debug('tmp_keys', $tmp_keys);
$this->_debug('createKeywords: tmp_keys', $tmp_keys);
foreach ($tmp_keys as $value) {
$value = strtolower($value); // index terms are stored with lower case
$value = preg_replace('/[^\w]+/u', '', $value);
if (empty(trim($value))) {
continue;
}
if (!in_array($value, $this->stopwords)) {
// eliminate stopwords
@ -335,6 +349,7 @@ class Index extends SearchBaseAbstract {
if (strlen($value) > 1) {
// do not index single characters
$this->keywords[$value] = $this->keywords[$value] . $idtype . '-' . $typeid . ' ';
$this->_debug('createKeywords: entry array keywords', $this->keywords);
}
}
}
@ -345,7 +360,7 @@ class Index extends SearchBaseAbstract {
}
}
$this->_debug('keywords', $this->keywords);
$this->_debug('createKeywords: keywords returned', $this->keywords);
}
/**
@ -357,9 +372,10 @@ class Index extends SearchBaseAbstract {
$tmp_count = array();
foreach ($this->keywords as $keyword => $count) {
$bProceed = true;
$this->_debug('keyword', $keyword);
$tmp_count = preg_split('/[\s]/', trim($count));
$this->_debug('tmp_count', $tmp_count);
$occurrence = count($tmp_count);
$tmp_count = array_unique($tmp_count);
$cms_types = implode(',', $tmp_count);
@ -376,8 +392,12 @@ class Index extends SearchBaseAbstract {
('" . Contenido_Security::escapeDB($keyword, $this->db) . "', '" . Contenido_Security::escapeDB($index_string, $this->db) . "', " . Contenido_Security::toInteger($this->lang) . ", " . Contenido_Security::toInteger($nextid) . ")";
} else {
// if keyword already exists, create new index_string
if (preg_match("/&$this->idart=/", $this->keywords_old[$keyword])) {
$index_string = preg_replace("/&$this->idart=[0-9]+\([\w-,]+\)/", $index_string, $this->keywords_old[$keyword]);
if (preg_match("/&" . $this->idart . "=/", $this->keywords_old[$keyword])) {
$index_string = preg_replace("/&" . $this->idart . "=[0-9]+\([,\w-]+\)/", $index_string, $this->keywords_old[$keyword]);
if ($index_string === $this->keywords_old[$keyword]) {
$bProceed = false;
$this->_debug('db update', 'no update needed');
}
} else {
$index_string = $this->keywords_old[$keyword] . $index_string;
}
@ -386,9 +406,11 @@ class Index extends SearchBaseAbstract {
SET " . $this->place . " = '" . $index_string . "'
WHERE idlang='" . Contenido_Security::toInteger($this->lang) . "' AND keyword='" . Contenido_Security::escapeDB($keyword, $this->db) . "'";
}
$this->_debug('sql', $sql);
$this->db->query($sql);
if ($bProceed) {
$this->_debug('sql', $sql);
$this->db->query($sql);
}
}
}
@ -431,7 +453,7 @@ class Index extends SearchBaseAbstract {
idlang=" . Contenido_Security::toInteger($this->lang) . " AND
(keyword IN ('" . $keys . "') OR " . $this->place . " REGEXP '&" . Contenido_Security::toInteger($this->idart) . "=')";
$this->_debug('sql', $sql);
$this->_debug('getKeywords: sql', $sql);
$this->db->query($sql);
@ -440,6 +462,8 @@ class Index extends SearchBaseAbstract {
while ($this->db->next_record()) {
$this->keywords_old[$this->db->f('keyword')] = $this->db->f($place);
}
$this->_debug('getKeywords: array keywords_old', $this->keywords_old);
}
/**
@ -448,6 +472,7 @@ class Index extends SearchBaseAbstract {
* @return $key
*/
function removeSpecialChars($key) {
$aSpecialChars = array(
"-", "_", "'", ".", "!", "\"", "#", "$", "%", "&", "(", ")", "*", "+", ",", "/",
":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "`", "{", "|", "}", "~"
@ -461,6 +486,7 @@ class Index extends SearchBaseAbstract {
// a client and should not be treated in this method.
// modified 2007-10-01, H. Librenz - added as hotfix for encoding problems (doesn't find any words with
// umlaut vowels in it since you turn on UTF-8 as language encoding)
$sEncoding = getEncodingByLanguage($this->db, $this->lang, $this->cfg);
if (strtolower($sEncoding) != 'iso-8859-2') {
@ -486,6 +512,9 @@ class Index extends SearchBaseAbstract {
$key = clHtmlEntityDecode($key);
$key = str_replace($aSpecialChars, '', $key);
ini_set('mbstring.substitute_character', "none");
$key = mb_convert_encoding($key, 'UTF-8', 'UTF-8');
return $key;
}
@ -516,6 +545,21 @@ class Index extends SearchBaseAbstract {
return $key;
}
/**
 * Returns the content type lookups held in the static class properties.
 *
 * When the type list is still empty, setContentTypes() is called first so
 * that the lookups can be filled before they are handed out.
 *
 * @return array array with the two entries 'cms_type' and 'cms_type_suffix',
 *               each holding the corresponding static lookup array
 */
public function getContentTypes(): array {
    $bTypesMissing = empty(self::$_cms_type);
    if ($bTypesMissing) {
        // nothing cached on the class yet, load the types once
        $this->setContentTypes();
    }

    $aContentTypes = [];
    $aContentTypes['cms_type'] = self::$_cms_type;
    $aContentTypes['cms_type_suffix'] = self::$_cms_type_suffix;

    return $aContentTypes;
}
/**
* set the array of stopwords which should not be indexed
* @param array $aStopwords
@ -537,8 +581,8 @@ class Index extends SearchBaseAbstract {
$this->_debug('sql', $sql);
$this->db->query($sql);
while ($this->db->next_record()) {
$this->cms_type[$this->db->f('type')] = $this->db->f('idtype');
$this->cms_type_suffix[$this->db->f('idtype')] = substr($this->db->f('type'), 4, strlen($this->db->f('type')));
self::$_cms_type[$this->db->f('type')] = $this->db->f('idtype');
self::$_cms_type_suffix[$this->db->f('idtype')] = substr($this->db->f('type'), 4, strlen($this->db->f('type')));
}
}
@ -554,11 +598,11 @@ class Index extends SearchBaseAbstract {
if (strlen($opt) > 0) {
if (!stristr($opt, 'cms_')) {
if (in_array($opt, $this->cms_type_suffix)) {
if (in_array($opt, $this->getContentTypes()['cms_type_suffix'])) {
$this->cms_options[$opt] = 'CMS_' . $opt;
}
} else {
if (array_key_exists($opt, $this->cms_type)) {
if (array_key_exists($opt, $this->getContentTypes()['cms_type'])) {
$this->cms_options[$opt] = $opt;
}
}
@ -788,8 +832,8 @@ class Search extends SearchBaseAbstract {
$this->index = new Index($oDB);
$this->cms_type = $this->index->cms_type;
$this->cms_type_suffix = $this->index->cms_type_suffix;
$this->cms_type = $this->index->getContentTypes()['cms_type'];
$this->cms_type_suffix = $this->index->getContentTypes()['cms_type_suffix'];
$this->search_option = (array_key_exists('db', $options)) ? strtolower($options['db']) : 'regexp';
$this->search_combination = (array_key_exists('combine', $options)) ? strtolower($options['combine']) : 'or';
@ -1339,11 +1383,11 @@ class SearchResult extends SearchBaseAbstract {
$cms_type = strtoupper($cms_type);
if (strlen($cms_type) > 0) {
if (!stristr($cms_type, 'cms_')) {
if (in_array($cms_type, $this->index->cms_type_suffix)) {
if (in_array($cms_type, $this->index->getContentTypes()['cms_type'])) {
$cms_type = 'CMS_' . $cms_type;
}
} else {
if (!array_key_exists($cms_type, $this->index->cms_type)) {
if (!array_key_exists($cms_type, $this->index->getContentTypes()['cms_type_suffix'])) {
return array();
}
}

View file

@ -1,4 +1,5 @@
<?php
/**
* File:
* class.articlelanguage.php
@ -18,14 +19,12 @@
*
* $Id$
*/
if (!defined('CON_FRAMEWORK')) {
die('Illegal call');
}
class cApiArticleLanguageCollection extends ItemCollection {
public function __construct($select = false) {
global $cfg;
parent::__construct($cfg["tab"]["art_lang"], "idartlang");
@ -36,26 +35,26 @@ class cApiArticleLanguageCollection extends ItemCollection {
$this->select($select);
}
}
public function getIdArtLang($iIdart, $iIdlang) {
$this->setWhere('idart', Contenido_Security::toInteger($iIdart));
$this->setWhere('idlang', Contenido_Security::toInteger($iIdlang));
if($this->query() && $this->count() > 0) {
if ($this->query() && $this->count() > 0) {
return $this->next()->get('idartlang');
}
return false;
}
}
class cApiArticleLanguage extends Item {
class cApiArticleLanguage extends Item
{
/**
* Constructor Function
* @param mixed $mId Specifies the ID of item to load
*
* @global type $cfg
* @param type $mId
*/
public function __construct($mId = false)
{
public function __construct($mId = false) {
global $cfg;
parent::__construct($cfg["tab"]["art_lang"], "idartlang");
$this->setFilters(array(), array());
@ -63,24 +62,23 @@ class cApiArticleLanguage extends Item
$this->loadByPrimaryKey($mId);
}
}
public function loadByArticleAndLanguageId($idart, $idlang) {
$result = true;
if (!$this->isLoaded()) {
if (!$this->isLoaded()) {
$idartlang = $this->_getIdArtLang($idart, $idlang);
$result = $this->loadByPrimaryKey($idartlang);
}
return $result;
}
/**
 * Looks up the idartlang that belongs to an article/language pair.
 *
 * Both ids are formatted with %d, so they reach the statement as integers.
 *
 * @param mixed $idart  id of the article
 * @param mixed $idlang id of the language
 * @return mixed value of the idartlang field as delivered by the db object
 */
protected function _getIdArtLang($idart, $idlang) {
    $sArtLangTable = cRegistry::getConfigValue('tab', 'art_lang');
    $sQuery = sprintf(
        'SELECT idartlang FROM `%s` WHERE idart = %d AND idlang = %d',
        $sArtLangTable,
        $idart,
        $idlang
    );

    $this->db->query($sQuery);
    // move to the first row; the result of next_record() is not evaluated here
    $this->db->next_record();

    return $this->db->f('idartlang');
}
public function getContent($type = '', $id = NULL) {
if (NULL === $this->content) {
$this->_loadArticleContent();
@ -108,16 +106,16 @@ class cApiArticleLanguage extends Item
// return String
return (isset($this->content[$type][$id])) ? $this->content[$type][$id] : '';
}
protected function _loadArticleContent() {
if (NULL !== $this->content) {
return;
}
$sql = "SELECT b.type, a.typeid, a.value FROM `".cRegistry::getConfigValue('tab', 'content')
."` AS a, `".cRegistry::getConfigValue('tab', 'type')
."` AS b WHERE a.idartlang = ".$this->get('idartlang')
." AND b.idtype = a.idtype ORDER BY a.idtype, a.typeid";
$sql = "SELECT b.type, a.typeid, a.value FROM `" . cRegistry::getConfigValue('tab', 'content')
. "` AS a, `" . cRegistry::getConfigValue('tab', 'type')
. "` AS b WHERE a.idartlang = " . $this->get('idartlang')
. " AND b.idtype = a.idtype ORDER BY a.idtype, a.typeid";
$this->db->query($sql);
@ -126,5 +124,7 @@ class cApiArticleLanguage extends Item
$this->content[strtolower($this->db->f('type'))][$this->db->f('typeid')] = urldecode($this->db->f('value'));
}
}
}
?>

View file

@ -619,48 +619,30 @@ function conSetMetaValue($idartlang, $idmetatype, $value) {
}
/**
* (re)generate keywords for all articles of a given client (with specified language)
* @param $client Client
* @param $lang Language of a client
* @return void
*
* @author Willi Man
* Created : 12.05.2004
* Modified : 13.05.2004
* @copyright four for business AG 2003
*
* @param int $client
* @param int $lang
*/
function conGenerateKeywords($client, $lang) {
global $cfg;
$db_art = new DB_ConLite;
function conGenerateKeywords(int $client = null, int $lang = null) {
$aOptions = [];
$aOptions['start'] = true;
$aOptions['offline'] = true;
$aOptions['client'] = $client ?? 0;
$aOptions['lang'] = $lang ?? 0;
$options = array("img", "link", "linktarget", "swf"); // cms types to be excluded from indexing
$sql = "SELECT
a.idart, b.idartlang
FROM
" . $cfg["tab"]["art"] . " AS a,
" . $cfg["tab"]["art_lang"] . " AS b
WHERE
a.idart = b.idart AND
a.idclient = " . Contenido_Security::escapeDB($client, $db) . " AND
b.idlang = " . Contenido_Security::escapeDB($lang, $db);
$db_art->query($sql);
$articles = array();
while ($db_art->next_record()) {
$articles[$db_art->f("idart")] = $db_art->f("idartlang");
}
if (count($articles) > 0) {
foreach ($articles as $artid => $article_lang) {
$article_content = array();
$article_content = conGetContentFromArticle($article_lang);
if (count($article_content) > 0) {
$art_index = new Index($db_art);
$art_index->lang = $lang;
$art_index->start($artid, $article_content, 'auto', $options);
$oArticleCollector = new cArticleCollector();
$oArticleCollector->setOptions($aOptions);
$oArticleCollector->loadArticles();
/* @var $oArticle cApiArticleLanguage */
if ($oArticleCollector->count() > 0) {
foreach ($oArticleCollector as $oArticle) {
$aArticleContent = [];
$aArticleContent = $oArticle->getContent();
if(!empty($aArticleContent)) {
/* @var $oIndex Index */
$oIndex = new Index();
//$oIndex->setDebug(true);
$oIndex->start($oArticle->get('idart'), $aArticleContent, 'auto', array("img", "link", "linktarget", "swf"));
}
}
}

View file

@ -2215,7 +2215,6 @@ function clHtmlEntityDecode(string $value, ?int $flags = ENT_QUOTES | ENT_SUBSTI
* @return string Returns the converted string
*/
function clHtmlEntities(string $value, ?int $flags = ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, string $encoding = 'UTF-8'): string {
    // Nothing may be printed from here: the function only returns the
    // converted string, and the former var_dump($flags) wrote debug output
    // into whatever the caller was producing.

    // htmlentities() declares $flags as a non-nullable int. An explicit null
    // is mapped to 0, which is the value implicit coercion produced before
    // (non-strict mode), so results stay the same while the "passing null"
    // deprecation of PHP 8.1+ is avoided.
    return htmlentities($value, $flags ?? 0, $encoding);
}