From 16224a7001abd7d5e826227731a92c1ef8ce255f Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 5 Oct 2020 12:50:18 +0000 Subject: [PATCH 1/3] Improve plaintext generation for language detection --- src/Content/Text/BBCode.php | 13 +++++++++++++ src/Model/Item.php | 12 +++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index 5b22746ce6..1b0fa9c740 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -1220,6 +1220,19 @@ class BBCode return $return; } + public static function removeLinks(string $bbcode) + { + $bbcode = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $bbcode); + $bbcode = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $1 ', $bbcode); + $bbcode = preg_replace("/\[img\](.*?)\[\/img\]/ism", ' ', $bbcode); + + $bbcode = preg_replace('/([@!#])\[url\=(.*?)\](.*?)\[\/url\]/ism', '', $bbcode); + $bbcode = preg_replace("/\[url\](.*?)\[\/url\]/ism", ' ', $bbcode); + $bbcode = preg_replace("/\[url=[^\[\]]*\](.*)\[\/url\]/Usi", ' $1 ', $bbcode); + $bbcode = preg_replace("/\[url\](.*?)\[\/url\]/ism", ' ', $bbcode); + return $bbcode; + } + /** * Converts a BBCode message to HTML message * diff --git a/src/Model/Item.php b/src/Model/Item.php index dfea296815..d53933ba78 100644 --- a/src/Model/Item.php +++ b/src/Model/Item.php @@ -2476,7 +2476,17 @@ class Item return ''; } - $naked_body = BBCode::toPlaintext($item['body'], false); + // Convert attachments to links + $naked_body = BBCode::removeAttachment($item['body']); + + // Remove links and pictures + $naked_body = BBCode::removeLinks($naked_body); + + // Convert the title and the body to plain text + $naked_body = trim($item['title'] . "\n" . BBCode::toPlaintext($naked_body)); + + // Remove possibly remaining links + $naked_body = preg_replace(Strings::autoLinkRegEx(), '', $naked_body); $ld = new Language(); $languages = $ld->detect($naked_body)->limit(0, 3)->close(); From 397f239abbcd554fc8d0f7e9a761a9498ae2995d Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Mon, 5 Oct 2020 17:40:06 +0200 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Hypolite Petovan --- src/Content/Text/BBCode.php | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index 1b0fa9c740..94b1c35d1b 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -1222,14 +1222,11 @@ class BBCode public static function removeLinks(string $bbcode) { - $bbcode = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $bbcode); $bbcode = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $1 ', $bbcode); - $bbcode = preg_replace("/\[img\](.*?)\[\/img\]/ism", ' ', $bbcode); + $bbcode = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $bbcode); - $bbcode = preg_replace('/([@!#])\[url\=(.*?)\](.*?)\[\/url\]/ism', '', $bbcode); - $bbcode = preg_replace("/\[url\](.*?)\[\/url\]/ism", ' ', $bbcode); $bbcode = preg_replace("/\[url=[^\[\]]*\](.*)\[\/url\]/Usi", ' $1 ', $bbcode); - $bbcode = preg_replace("/\[url\](.*?)\[\/url\]/ism", ' ', $bbcode); + $bbcode = preg_replace('/[@!#]?\[url.*?\[\/url\]/ism', '', $bbcode); return $bbcode; } From 20652870b6dde1a071d58e9850921e0a3fd1a6be Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Tue, 6 Oct 2020 04:55:28 +0200 Subject: [PATCH 3/3] Update src/Content/Text/BBCode.php Co-authored-by: Hypolite Petovan --- src/Content/Text/BBCode.php | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index 94b1c35d1b..594cd9a6ce 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -1225,6 +1225,7 @@ class BBCode $bbcode = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $1 ', $bbcode); $bbcode = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $bbcode); + $bbcode = preg_replace('/[@!#]\[url\=.*?\].*?\[\/url\]/ism', '', $bbcode); $bbcode = preg_replace("/\[url=[^\[\]]*\](.*)\[\/url\]/Usi", ' $1 ', $bbcode); $bbcode = preg_replace('/[@!#]?\[url.*?\[\/url\]/ism', '', $bbcode); return $bbcode;