parse_url: Further improvements of the new method to fetch page data
This commit is contained in:
parent
09034ce0ee
commit
02a1fc9cd0
2 changed files with 66 additions and 41 deletions
|
@ -1727,5 +1727,6 @@ notifications/follow
|
||||||
notifications/leave
|
notifications/leave
|
||||||
blocks/exists
|
blocks/exists
|
||||||
blocks/blocking
|
blocks/blocking
|
||||||
|
lists
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,4 @@
|
||||||
<?php
|
<?php
|
||||||
require_once('include/Photo.php');
|
|
||||||
|
|
||||||
if(!function_exists('deletenode')) {
|
if(!function_exists('deletenode')) {
|
||||||
function deletenode(&$doc, $node)
|
function deletenode(&$doc, $node)
|
||||||
{
|
{
|
||||||
|
@ -11,6 +9,30 @@ if(!function_exists('deletenode')) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Turn a possibly relative URL into an absolute one.
 *
 * The URL is returned unchanged when it already carries a scheme, when it
 * cannot be parsed, or when the base URL offers no scheme and host to
 * complete it with.
 *
 * Handled reference forms:
 *  - "//host/path"  (protocol relative): only the scheme is taken from the base
 *  - "/path"        (host relative): scheme, host and port are taken from the base
 *  - "path"         (document relative): resolved against the directory of the
 *                   base path; "." and ".." segments are NOT collapsed
 *
 * @param string $url    URL as found in the page (absolute or relative)
 * @param string $scheme Absolute URL of the page the reference was found on
 *                       (parameter name kept for compatibility with callers)
 * @return string The completed URL
 */
function completeurl($url, $scheme) {
	$urlarr = parse_url($url);

	// parse_url() returns false on seriously malformed input
	if (!is_array($urlarr) || isset($urlarr["scheme"]))
		return($url);

	$schemearr = parse_url($scheme);

	// Nothing sensible can be built without the scheme and host of the base
	if (!is_array($schemearr) || !isset($schemearr["scheme"]) || !isset($schemearr["host"]))
		return($url);

	$complete = $schemearr["scheme"]."://";

	if (isset($urlarr["host"])) {
		// Protocol relative URL: it names its own host (and maybe port)
		$complete .= $urlarr["host"];

		if (isset($urlarr["port"]) && ($urlarr["port"] != ""))
			$complete .= ":".$urlarr["port"];
	} else {
		$complete .= $schemearr["host"];

		if (isset($schemearr["port"]) && ($schemearr["port"] != ""))
			$complete .= ":".$schemearr["port"];
	}

	$path = isset($urlarr["path"]) ? $urlarr["path"] : "";

	// A path without leading slash is relative to the directory of the base
	// document, otherwise it would be glued directly onto the host name.
	if (!isset($urlarr["host"]) && ($path !== "") && (substr($path, 0, 1) !== "/")) {
		$basepath = isset($schemearr["path"]) ? $schemearr["path"] : "/";
		$pos = strrpos($basepath, "/");
		$basedir = ($pos === false) ? "/" : substr($basepath, 0, $pos + 1);
		$path = $basedir.$path;
	}

	$complete .= $path;

	if (isset($urlarr["query"]) && ($urlarr["query"] != ""))
		$complete .= "?".$urlarr["query"];

	if (isset($urlarr["fragment"]) && ($urlarr["fragment"] != ""))
		$complete .= "#".$urlarr["fragment"];

	return($complete);
}
|
||||||
|
|
||||||
function parseurl_getsiteinfo($url) {
|
function parseurl_getsiteinfo($url) {
|
||||||
$siteinfo = array();
|
$siteinfo = array();
|
||||||
|
|
||||||
|
@ -25,7 +47,8 @@ function parseurl_getsiteinfo($url) {
|
||||||
$header = curl_exec($ch);
|
$header = curl_exec($ch);
|
||||||
curl_close($ch);
|
curl_close($ch);
|
||||||
|
|
||||||
if (preg_match('/charset=(.*?)\n/', $header, $matches))
|
// Fetch the first mentioned charset. Can be in body or header
|
||||||
|
if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
|
||||||
$charset = trim(array_pop($matches));
|
$charset = trim(array_pop($matches));
|
||||||
else
|
else
|
||||||
$charset = "utf-8";
|
$charset = "utf-8";
|
||||||
|
@ -57,11 +80,13 @@ function parseurl_getsiteinfo($url) {
|
||||||
|
|
||||||
$xpath = new DomXPath($doc);
|
$xpath = new DomXPath($doc);
|
||||||
|
|
||||||
$list = $xpath->query("head/title");
|
//$list = $xpath->query("head/title");
|
||||||
|
$list = $xpath->query("//title");
|
||||||
foreach ($list as $node)
|
foreach ($list as $node)
|
||||||
$siteinfo["title"] = html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");
|
$siteinfo["title"] = html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");
|
||||||
|
|
||||||
$list = $xpath->query("head/meta[@name]");
|
//$list = $xpath->query("head/meta[@name]");
|
||||||
|
$list = $xpath->query("//meta[@name]");
|
||||||
foreach ($list as $node) {
|
foreach ($list as $node) {
|
||||||
$attr = array();
|
$attr = array();
|
||||||
if ($node->attributes->length)
|
if ($node->attributes->length)
|
||||||
|
@ -86,7 +111,8 @@ function parseurl_getsiteinfo($url) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$list = $xpath->query("head/meta[@property]");
|
//$list = $xpath->query("head/meta[@property]");
|
||||||
|
$list = $xpath->query("//meta[@property]");
|
||||||
foreach ($list as $node) {
|
foreach ($list as $node) {
|
||||||
$attr = array();
|
$attr = array();
|
||||||
if ($node->attributes->length)
|
if ($node->attributes->length)
|
||||||
|
@ -116,38 +142,32 @@ function parseurl_getsiteinfo($url) {
|
||||||
foreach ($node->attributes as $attribute)
|
foreach ($node->attributes as $attribute)
|
||||||
$attr[$attribute->name] = $attribute->value;
|
$attr[$attribute->name] = $attribute->value;
|
||||||
|
|
||||||
// guess mimetype from headers or filename
|
$src = completeurl($attr["src"], $url);
|
||||||
$type = guess_image_type($attr["src"],true);
|
$photodata = getimagesize($src);
|
||||||
|
|
||||||
$i = fetch_url($attr["src"]);
|
if (($photodata[0] > 150) and ($photodata[1] > 150)) {
|
||||||
$ph = new Photo($i, $type);
|
if ($photodata[0] > 300) {
|
||||||
|
$photodata[1] = $photodata[1] * (300 / $photodata[0]);
|
||||||
if($ph->is_valid() and ($ph->getWidth() > 200) and ($ph->getHeight() > 200)) {
|
$photodata[0] = 300;
|
||||||
if ($siteinfo["image"] == "")
|
}
|
||||||
$siteinfo["image"] = $attr["src"];
|
if ($photodata[1] > 300) {
|
||||||
|
$photodata[0] = $photodata[0] * (300 / $photodata[1]);
|
||||||
if($ph->getWidth() > 300 || $ph->getHeight() > 300) {
|
$photodata[1] = 300;
|
||||||
$ph->scaleImage(300);
|
}
|
||||||
$siteinfo["images"][] = array("src"=>$attr["src"],
|
$siteinfo["images"][] = array("src"=>$src,
|
||||||
"width"=>$ph->getWidth(),
|
"width"=>$photodata[0],
|
||||||
"height"=>$ph->getHeight());
|
"height"=>$photodata[1]);
|
||||||
} else
|
|
||||||
$siteinfo["images"][] = array("src"=>$attr["src"],
|
|
||||||
"width"=>$ph->getWidth(),
|
|
||||||
"height"=>$ph->getHeight());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// guess mimetype from headers or filename
|
$src = completeurl($siteinfo["image"], $url);
|
||||||
$type = guess_image_type($siteinfo["image"],true);
|
$photodata = getimagesize($src);
|
||||||
|
|
||||||
$i = fetch_url($siteinfo["image"]);
|
if (($photodata[0] > 10) and ($photodata[1] > 10))
|
||||||
$ph = new Photo($i, $type);
|
$siteinfo["images"][] = array("src"=>$src,
|
||||||
|
"width"=>$photodata[0],
|
||||||
if($ph->is_valid())
|
"height"=>$photodata[1]);
|
||||||
$siteinfo["images"][] = array("src"=>$siteinfo["image"],
|
|
||||||
"width"=>$ph->getWidth(),
|
|
||||||
"height"=>$ph->getHeight());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($siteinfo["text"] == "") {
|
if ($siteinfo["text"] == "") {
|
||||||
|
@ -155,19 +175,22 @@ function parseurl_getsiteinfo($url) {
|
||||||
|
|
||||||
$list = $xpath->query("//div[@class='article']");
|
$list = $xpath->query("//div[@class='article']");
|
||||||
foreach ($list as $node)
|
foreach ($list as $node)
|
||||||
$text .= " ".trim($node->nodeValue);
|
if (strlen($node->nodeValue) > 40)
|
||||||
|
$text .= " ".trim($node->nodeValue);
|
||||||
|
|
||||||
if ($text == "") {
|
if ($text == "") {
|
||||||
$list = $xpath->query("//div[@class='content']");
|
$list = $xpath->query("//div[@class='content']");
|
||||||
foreach ($list as $node)
|
foreach ($list as $node)
|
||||||
$text .= " ".trim($node->nodeValue);
|
if (strlen($node->nodeValue) > 40)
|
||||||
|
$text .= " ".trim($node->nodeValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If no text was found then take the paragraph content
|
// If no text was found then take the paragraph content
|
||||||
if ($text == "") {
|
if ($text == "") {
|
||||||
$list = $xpath->query("//p");
|
$list = $xpath->query("//p");
|
||||||
foreach ($list as $node)
|
foreach ($list as $node)
|
||||||
$text .= " ".trim($node->nodeValue);
|
if (strlen($node->nodeValue) > 40)
|
||||||
|
$text .= " ".trim($node->nodeValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($text != "") {
|
if ($text != "") {
|
||||||
|
@ -238,9 +261,9 @@ function parse_url_content(&$a) {
|
||||||
if($url && $title && $text) {
|
if($url && $title && $text) {
|
||||||
|
|
||||||
if($textmode)
|
if($textmode)
|
||||||
$text = $br . $br . '[quote]' . trim($text) . '[/quote]' . $br;
|
$text = $br . '[quote]' . trim($text) . '[/quote]' . $br;
|
||||||
else
|
else
|
||||||
$text = '<br /><br /><blockquote>' . trim($text) . '</blockquote><br />';
|
$text = '<br /><blockquote>' . trim($text) . '</blockquote><br />';
|
||||||
|
|
||||||
$title = str_replace(array("\r","\n"),array('',''),$title);
|
$title = str_replace(array("\r","\n"),array('',''),$title);
|
||||||
|
|
||||||
|
@ -255,7 +278,8 @@ function parse_url_content(&$a) {
|
||||||
$siteinfo = parseurl_getsiteinfo($url);
|
$siteinfo = parseurl_getsiteinfo($url);
|
||||||
|
|
||||||
if($siteinfo["title"] == "") {
|
if($siteinfo["title"] == "") {
|
||||||
echo sprintf($template,$url,$url,'') . $str_tags;
|
echo print_r($siteinfo, true);
|
||||||
|
//echo sprintf($template,$url,$url,'') . $str_tags;
|
||||||
killme();
|
killme();
|
||||||
} else {
|
} else {
|
||||||
$text = $siteinfo["text"];
|
$text = $siteinfo["text"];
|
||||||
|
@ -305,7 +329,7 @@ function parse_url_content(&$a) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if($image) {
|
if($image) {
|
||||||
$text = $br.$br.$image.$br.$text;
|
$text = $br.$br.$image.$text;
|
||||||
}
|
}
|
||||||
$title = str_replace(array("\r","\n"),array('',''),$title);
|
$title = str_replace(array("\r","\n"),array('',''),$title);
|
||||||
|
|
||||||
|
@ -313,6 +337,6 @@ function parse_url_content(&$a) {
|
||||||
|
|
||||||
logger('parse_url: returns: ' . $result);
|
logger('parse_url: returns: ' . $result);
|
||||||
|
|
||||||
echo $result;
|
echo trim($result);
|
||||||
killme();
|
killme();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue