parse_url: Further improvements of the new method to fetch page data

This commit is contained in:
Michael Vogel 2012-07-12 23:41:04 +02:00
parent 09034ce0ee
commit 02a1fc9cd0
2 changed files with 66 additions and 41 deletions

View file

@ -1727,5 +1727,6 @@ notifications/follow
notifications/leave notifications/leave
blocks/exists blocks/exists
blocks/blocking blocks/blocking
lists
*/ */

View file

@ -1,6 +1,4 @@
<?php <?php
require_once('include/Photo.php');
if(!function_exists('deletenode')) { if(!function_exists('deletenode')) {
function deletenode(&$doc, $node) function deletenode(&$doc, $node)
{ {
@ -11,6 +9,30 @@ if(!function_exists('deletenode')) {
} }
} }
/**
 * Turn a possibly relative URL into an absolute one.
 *
 * If $url already carries a scheme it is returned unchanged. Otherwise the
 * scheme (and, unless $url names its own host, the host and port) are taken
 * from $scheme. A path that does not start with "/" is resolved against the
 * directory of the base URL's path. Dot segments ("./", "../") are not
 * normalised.
 *
 * @param string $url    The URL to complete (absolute, protocol-relative,
 *                       root-relative or document-relative).
 * @param string $scheme An absolute URL that supplies the missing parts,
 *                       typically the address of the page $url was found on.
 *
 * @return string The completed URL, or $url unchanged when it is already
 *                absolute or when either argument cannot be parsed well
 *                enough to build an absolute URL.
 */
function completeurl($url, $scheme) {
	$urlarr = parse_url($url);

	// Already absolute, or too malformed for parse_url(): nothing to complete
	if (($urlarr === false) or isset($urlarr["scheme"]))
		return($url);

	$schemearr = parse_url($scheme);

	// Without scheme and host of the base there is nothing to complete with
	if (($schemearr === false) or !isset($schemearr["scheme"]) or !isset($schemearr["host"]))
		return($url);

	$complete = $schemearr["scheme"]."://";

	if (isset($urlarr["host"])) {
		// Protocol-relative URL ("//host/path"): keep its own host and port
		$complete .= $urlarr["host"];
		if (isset($urlarr["port"]))
			$complete .= ":".$urlarr["port"];
	} else {
		$complete .= $schemearr["host"];
		if (isset($schemearr["port"]))
			$complete .= ":".$schemearr["port"];
	}

	$path = (isset($urlarr["path"]) ? $urlarr["path"] : "");

	if (($path != "") and ($path[0] != "/")) {
		// Document-relative path: prepend the directory part of the base path
		// so that host and path are always separated by a slash
		$basepath = (isset($schemearr["path"]) ? $schemearr["path"] : "/");
		$slash = strrpos($basepath, "/");
		$basedir = (($slash === false) ? "/" : substr($basepath, 0, $slash + 1));
		$path = $basedir.$path;
	}

	$complete .= $path;

	if (isset($urlarr["query"]) and ($urlarr["query"] != ""))
		$complete .= "?".$urlarr["query"];

	if (isset($urlarr["fragment"]) and ($urlarr["fragment"] != ""))
		$complete .= "#".$urlarr["fragment"];

	return($complete);
}
function parseurl_getsiteinfo($url) { function parseurl_getsiteinfo($url) {
$siteinfo = array(); $siteinfo = array();
@ -25,7 +47,8 @@ function parseurl_getsiteinfo($url) {
$header = curl_exec($ch); $header = curl_exec($ch);
curl_close($ch); curl_close($ch);
if (preg_match('/charset=(.*?)\n/', $header, $matches)) // Fetch the first mentioned charset. Can be in body or header
if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
$charset = trim(array_pop($matches)); $charset = trim(array_pop($matches));
else else
$charset = "utf-8"; $charset = "utf-8";
@ -57,11 +80,13 @@ function parseurl_getsiteinfo($url) {
$xpath = new DomXPath($doc); $xpath = new DomXPath($doc);
$list = $xpath->query("head/title"); //$list = $xpath->query("head/title");
$list = $xpath->query("//title");
foreach ($list as $node) foreach ($list as $node)
$siteinfo["title"] = html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8"); $siteinfo["title"] = html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");
$list = $xpath->query("head/meta[@name]"); //$list = $xpath->query("head/meta[@name]");
$list = $xpath->query("//meta[@name]");
foreach ($list as $node) { foreach ($list as $node) {
$attr = array(); $attr = array();
if ($node->attributes->length) if ($node->attributes->length)
@ -86,7 +111,8 @@ function parseurl_getsiteinfo($url) {
} }
} }
$list = $xpath->query("head/meta[@property]"); //$list = $xpath->query("head/meta[@property]");
$list = $xpath->query("//meta[@property]");
foreach ($list as $node) { foreach ($list as $node) {
$attr = array(); $attr = array();
if ($node->attributes->length) if ($node->attributes->length)
@ -116,38 +142,32 @@ function parseurl_getsiteinfo($url) {
foreach ($node->attributes as $attribute) foreach ($node->attributes as $attribute)
$attr[$attribute->name] = $attribute->value; $attr[$attribute->name] = $attribute->value;
// guess mimetype from headers or filename $src = completeurl($attr["src"], $url);
$type = guess_image_type($attr["src"],true); $photodata = getimagesize($src);
$i = fetch_url($attr["src"]); if (($photodata[0] > 150) and ($photodata[1] > 150)) {
$ph = new Photo($i, $type); if ($photodata[0] > 300) {
$photodata[1] = $photodata[1] * (300 / $photodata[0]);
if($ph->is_valid() and ($ph->getWidth() > 200) and ($ph->getHeight() > 200)) { $photodata[0] = 300;
if ($siteinfo["image"] == "") }
$siteinfo["image"] = $attr["src"]; if ($photodata[1] > 300) {
$photodata[0] = $photodata[0] * (300 / $photodata[1]);
if($ph->getWidth() > 300 || $ph->getHeight() > 300) { $photodata[1] = 300;
$ph->scaleImage(300); }
$siteinfo["images"][] = array("src"=>$attr["src"], $siteinfo["images"][] = array("src"=>$src,
"width"=>$ph->getWidth(), "width"=>$photodata[0],
"height"=>$ph->getHeight()); "height"=>$photodata[1]);
} else
$siteinfo["images"][] = array("src"=>$attr["src"],
"width"=>$ph->getWidth(),
"height"=>$ph->getHeight());
} }
} }
} else { } else {
// guess mimetype from headers or filename $src = completeurl($siteinfo["image"], $url);
$type = guess_image_type($siteinfo["image"],true); $photodata = getimagesize($src);
$i = fetch_url($siteinfo["image"]); if (($photodata[0] > 10) and ($photodata[1] > 10))
$ph = new Photo($i, $type); $siteinfo["images"][] = array("src"=>$src,
"width"=>$photodata[0],
if($ph->is_valid()) "height"=>$photodata[1]);
$siteinfo["images"][] = array("src"=>$siteinfo["image"],
"width"=>$ph->getWidth(),
"height"=>$ph->getHeight());
} }
if ($siteinfo["text"] == "") { if ($siteinfo["text"] == "") {
@ -155,19 +175,22 @@ function parseurl_getsiteinfo($url) {
$list = $xpath->query("//div[@class='article']"); $list = $xpath->query("//div[@class='article']");
foreach ($list as $node) foreach ($list as $node)
$text .= " ".trim($node->nodeValue); if (strlen($node->nodeValue) > 40)
$text .= " ".trim($node->nodeValue);
if ($text == "") { if ($text == "") {
$list = $xpath->query("//div[@class='content']"); $list = $xpath->query("//div[@class='content']");
foreach ($list as $node) foreach ($list as $node)
$text .= " ".trim($node->nodeValue); if (strlen($node->nodeValue) > 40)
$text .= " ".trim($node->nodeValue);
} }
// If none text was found then take the paragraph content // If none text was found then take the paragraph content
if ($text == "") { if ($text == "") {
$list = $xpath->query("//p"); $list = $xpath->query("//p");
foreach ($list as $node) foreach ($list as $node)
$text .= " ".trim($node->nodeValue); if (strlen($node->nodeValue) > 40)
$text .= " ".trim($node->nodeValue);
} }
if ($text != "") { if ($text != "") {
@ -238,9 +261,9 @@ function parse_url_content(&$a) {
if($url && $title && $text) { if($url && $title && $text) {
if($textmode) if($textmode)
$text = $br . $br . '[quote]' . trim($text) . '[/quote]' . $br; $text = $br . '[quote]' . trim($text) . '[/quote]' . $br;
else else
$text = '<br /><br /><blockquote>' . trim($text) . '</blockquote><br />'; $text = '<br /><blockquote>' . trim($text) . '</blockquote><br />';
$title = str_replace(array("\r","\n"),array('',''),$title); $title = str_replace(array("\r","\n"),array('',''),$title);
@ -255,7 +278,8 @@ function parse_url_content(&$a) {
$siteinfo = parseurl_getsiteinfo($url); $siteinfo = parseurl_getsiteinfo($url);
if($siteinfo["title"] == "") { if($siteinfo["title"] == "") {
echo sprintf($template,$url,$url,'') . $str_tags; echo print_r($siteinfo, true);
//echo sprintf($template,$url,$url,'') . $str_tags;
killme(); killme();
} else { } else {
$text = $siteinfo["text"]; $text = $siteinfo["text"];
@ -305,7 +329,7 @@ function parse_url_content(&$a) {
} }
if($image) { if($image) {
$text = $br.$br.$image.$br.$text; $text = $br.$br.$image.$text;
} }
$title = str_replace(array("\r","\n"),array('',''),$title); $title = str_replace(array("\r","\n"),array('',''),$title);
@ -313,6 +337,6 @@ function parse_url_content(&$a) {
logger('parse_url: returns: ' . $result); logger('parse_url: returns: ' . $result);
echo $result; echo trim($result);
killme(); killme();
} }