The language detection is now done in blocks
This commit is contained in:
parent
1723aa0697
commit
f4591b2cc7
3 changed files with 85 additions and 31 deletions
|
@ -228,6 +228,7 @@ Called after the language detection. This can be used for alternative language d
|
||||||
- **text**: The text that is analyzed.
|
- **text**: The text that is analyzed.
|
||||||
- **detected**: (input/output) Array of language codes detected in the related text. The array key is the language code, the array value the probability.
|
- **detected**: (input/output) Array of language codes detected in the related text. The array key is the language code, the array value the probability.
|
||||||
- **uri-id**: The Uri-Id of the item.
|
- **uri-id**: The Uri-Id of the item.
|
||||||
|
- **author-id**: The id of the author contact.
|
||||||
|
|
||||||
### addon_settings
|
### addon_settings
|
||||||
Called when generating the HTML for the addon settings page.
|
Called when generating the HTML for the addon settings page.
|
||||||
|
|
|
@ -110,6 +110,7 @@ Dieser Hook kann dafür verwendet werden, alternative Erkennungsfunktionen einzu
|
||||||
'text' => Der analysierte Text.
|
'text' => Der analysierte Text.
|
||||||
'detected' => (Eingabe/Ausgabe) Das Array mit den erkannten Sprachen. Der Sprachcode ist der Array-Schlüssel, der Array-Wert ist der dezimale Wert für die Wahrscheinlichkeit.
|
'detected' => (Eingabe/Ausgabe) Das Array mit den erkannten Sprachen. Der Sprachcode ist der Array-Schlüssel, der Array-Wert ist der dezimale Wert für die Wahrscheinlichkeit.
|
||||||
'uri-id' => Die Uri-Id des Beitrags
|
'uri-id' => Die Uri-Id des Beitrags
|
||||||
|
'author-id' => Die Contact-id des Autors.
|
||||||
|
|
||||||
**'addon_settings'** - wird aufgerufen, wenn die HTML-Ausgabe der Addon-Einstellungsseite generiert wird.
|
**'addon_settings'** - wird aufgerufen, wenn die HTML-Ausgabe der Addon-Einstellungsseite generiert wird.
|
||||||
$b ist die HTML-Ausgabe (String) der Addon-Einstellungsseite vor dem finalen "</form>"-Tag.
|
$b ist die HTML-Ausgabe (String) der Addon-Einstellungsseite vor dem finalen "</form>"-Tag.
|
||||||
|
|
|
@ -49,6 +49,7 @@ use Friendica\Util\Proxy;
|
||||||
use Friendica\Util\Strings;
|
use Friendica\Util\Strings;
|
||||||
use Friendica\Util\Temporal;
|
use Friendica\Util\Temporal;
|
||||||
use GuzzleHttp\Psr7\Uri;
|
use GuzzleHttp\Psr7\Uri;
|
||||||
|
use IntlChar;
|
||||||
use LanguageDetection\Language;
|
use LanguageDetection\Language;
|
||||||
|
|
||||||
class Item
|
class Item
|
||||||
|
@ -2010,67 +2011,118 @@ class Item
|
||||||
*/
|
*/
|
||||||
public static function getLanguageArray(string $body, int $count, int $uri_id = 0, int $author_id = 0): array
|
public static function getLanguageArray(string $body, int $count, int $uri_id = 0, int $author_id = 0): array
|
||||||
{
|
{
|
||||||
$naked_body = BBCode::toSearchText($body, $uri_id);
|
$searchtext = BBCode::toSearchText($body, $uri_id);
|
||||||
|
|
||||||
if ((count(explode(' ', $naked_body)) < 10) && (mb_strlen($naked_body) < 30) && $author_id) {
|
if ((count(explode(' ', $searchtext)) < 10) && (mb_strlen($searchtext) < 30) && $author_id) {
|
||||||
$author = Contact::selectFirst(['about'], ['id' => $author_id]);
|
$author = Contact::selectFirst(['about'], ['id' => $author_id]);
|
||||||
if (!empty($author['about'])) {
|
if (!empty($author['about'])) {
|
||||||
$about = BBCode::toSearchText($author['about'], 0);
|
$about = BBCode::toSearchText($author['about'], 0);
|
||||||
$about = self::getDominantLanguage($about);
|
Logger::debug('About field added', ['author' => $author_id, 'body' => $searchtext, 'about' => $about]);
|
||||||
Logger::debug('About field added', ['author' => $author_id, 'body' => $naked_body, 'about' => $about]);
|
$searchtext .= ' ' . $about;
|
||||||
$naked_body .= ' ' . $about;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (empty($naked_body)) {
|
if (empty($searchtext)) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
$naked_body = self::getDominantLanguage($naked_body);
|
|
||||||
|
|
||||||
$availableLanguages = DI::l10n()->getAvailableLanguages(true);
|
$availableLanguages = DI::l10n()->getAvailableLanguages(true);
|
||||||
$availableLanguages = DI::l10n()->convertForLanguageDetection($availableLanguages);
|
$availableLanguages = DI::l10n()->convertForLanguageDetection($availableLanguages);
|
||||||
|
|
||||||
$ld = new Language(array_keys($availableLanguages));
|
$ld = new Language(array_keys($availableLanguages));
|
||||||
$languages = $ld->detect($naked_body)->limit(0, $count)->close() ?: [];
|
|
||||||
|
|
||||||
$data = [
|
$result = [];
|
||||||
'text' => $naked_body,
|
|
||||||
'detected' => $languages,
|
|
||||||
'uri-id' => $uri_id,
|
|
||||||
];
|
|
||||||
|
|
||||||
Hook::callAll('detect_languages', $data);
|
foreach (self::splitByBlocks($searchtext) as $block) {
|
||||||
$languages = $data['detected'];
|
$languages = $ld->detect($block)->limit(0, $count)->close() ?: [];
|
||||||
|
|
||||||
return $languages;
|
$data = [
|
||||||
|
'text' => $block,
|
||||||
|
'detected' => $languages,
|
||||||
|
'uri-id' => $uri_id,
|
||||||
|
'author-id' => $author_id,
|
||||||
|
];
|
||||||
|
Hook::callAll('detect_languages', $data);
|
||||||
|
|
||||||
|
foreach ($data['detected'] as $language => $quality) {
|
||||||
|
$result[$language] = max($result[$language] ?? 0, $quality * (strlen($block) / strlen($searchtext)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
arsort($result);
|
||||||
|
$result = array_slice($result, 0, $count);
|
||||||
|
|
||||||
|
return $result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if latin or non latin are dominant in the body and only return the dominant one
|
* Split a string into its different Unicode blocks
|
||||||
|
* Currently the text is split into a Latin and a non-Latin part.
|
||||||
*
|
*
|
||||||
* @param string $body
|
* @param string $body
|
||||||
* @return string
|
* @return array
|
||||||
*/
|
*/
|
||||||
private static function getDominantLanguage(string $body): string
|
private static function splitByBlocks(string $body): array
|
||||||
{
|
{
|
||||||
$latin = '';
|
$blocks = [];
|
||||||
$non_latin = '';
|
$previous_block = 0;
|
||||||
|
|
||||||
for ($i = 0; $i < mb_strlen($body); $i++) {
|
for ($i = 0; $i < mb_strlen($body); $i++) {
|
||||||
$character = mb_substr($body, $i, 1);
|
$character = mb_substr($body, $i, 1);
|
||||||
$ord = mb_ord($character);
|
$previous = ($i > 0) ? mb_substr($body, $i - 1, 1) : '';
|
||||||
|
$next = ($i < mb_strlen($body)) ? mb_substr($body, $i + 1, 1) : '';
|
||||||
|
|
||||||
// We add the most common characters to both strings.
|
if (!IntlChar::isalpha($character)) {
|
||||||
if (($ord <= 64) || ($ord >= 91 && $ord <= 96) || ($ord >= 123 && $ord <= 191) || in_array($ord, [215, 247]) || ($ord >= 697 && $ord <= 735) || ($ord > 65535)) {
|
if (($previous != '') && (IntlChar::isalpha($previous))) {
|
||||||
$latin .= $character;
|
$previous_block = self::getBlockCode($previous);
|
||||||
$non_latin .= $character;
|
}
|
||||||
} elseif ($ord < 768) {
|
|
||||||
$latin .= $character;
|
$block = (($next != '') && IntlChar::isalpha($next)) ? self::getBlockCode($next) : $previous_block;
|
||||||
|
$blocks[$block] = ($blocks[$block] ?? '') . $character;
|
||||||
} else {
|
} else {
|
||||||
$non_latin .= $character;
|
$block = self::getBlockCode($character);
|
||||||
|
$blocks[$block] = ($blocks[$block] ?? '') . $character;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return (mb_strlen($latin) > mb_strlen($non_latin)) ? $latin : $non_latin;
|
|
||||||
|
foreach (array_keys($blocks) as $key) {
|
||||||
|
$blocks[$key] = trim($blocks[$key]);
|
||||||
|
if (empty($blocks[$key])) {
|
||||||
|
unset($blocks[$key]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return array_values($blocks);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the block code for the given character
|
||||||
|
*
|
||||||
|
* @param string $character
|
||||||
|
* @return integer 0 = non-alphabetic character (blank, signs, emojis, ...), 1 = Latin character, 2 = alphabetic character from any other script
|
||||||
|
*/
|
||||||
|
private static function getBlockCode(string $character): int
|
||||||
|
{
|
||||||
|
if (!IntlChar::isalpha($character)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return self::isLatin($character) ? 1 : 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if the given character is in one of the Latin code blocks
|
||||||
|
*
|
||||||
|
* @param string $character
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
|
private static function isLatin(string $character): bool
|
||||||
|
{
|
||||||
|
return in_array(IntlChar::getBlockCode($character), [
|
||||||
|
IntlChar::BLOCK_CODE_BASIC_LATIN, IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT,
|
||||||
|
IntlChar::BLOCK_CODE_LATIN_EXTENDED_A, IntlChar::BLOCK_CODE_LATIN_EXTENDED_B,
|
||||||
|
IntlChar::BLOCK_CODE_LATIN_EXTENDED_C, IntlChar::BLOCK_CODE_LATIN_EXTENDED_D,
|
||||||
|
IntlChar::BLOCK_CODE_LATIN_EXTENDED_E, IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL
|
||||||
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function getLanguageMessage(array $item): string
|
public static function getLanguageMessage(array $item): string
|
||||||
|
|
Loading…
Reference in a new issue