PHP Crawler
For crawling in PHP I have always used the fantastic cURL.
My curl single-threaded function:
{
    // Fetches a single URL with cURL and returns the response body.
    //
    // NOTE(review): the function signature line is missing from this listing;
    // the body expects $url to be in scope (presumably the only parameter).
    //
    // Returns whatever curl_exec() returns: with CURLOPT_RETURNTRANSFER set
    // that is the body as a string, or false on failure. CURLOPT_FAILONERROR
    // makes HTTP status codes >= 400 count as failures.
    $agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_NOSIGNAL, 1);
    curl_setopt($ch, CURLOPT_NOPROGRESS, 1);
    curl_setopt($ch, CURLOPT_FAILONERROR, 1);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, $agent);
    // Certificate verification is switched off, so HTTPS peers are NOT
    // authenticated. Acceptable for a throwaway crawler, unsafe otherwise.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    // Follow at most one redirect.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 1);
    // Whole-transfer timeout in seconds.
    curl_setopt($ch, CURLOPT_TIMEOUT, 5);
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

    $html = curl_exec($ch);
    curl_close($ch);

    return $html;
}
My curl multi-threaded function:
{
    // Fetches several URLs in parallel with the curl_multi_* API.
    //
    // NOTE(review): the function signature line is missing from this listing;
    // the body expects $urls (array of URLs), $timeout (seconds) and $verbose
    // ("yes" / "no") to be in scope.
    //
    // Returns an array keyed like $urls holding the body of every transfer
    // that finished without a cURL error; failed URLs have no entry.
    $agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
    $mh = curl_multi_init();
    $conn = array();

    foreach ($urls as $i => $url) {
        $conn[$i] = curl_init($url);
        curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($conn[$i], CURLOPT_NOSIGNAL, 1);
        curl_setopt($conn[$i], CURLOPT_NOPROGRESS, 1);
        curl_setopt($conn[$i], CURLOPT_FAILONERROR, 1);
        curl_setopt($conn[$i], CURLOPT_URL, $url);
        curl_setopt($conn[$i], CURLOPT_USERAGENT, $agent);
        // Certificate verification is switched off: HTTPS peers are not authenticated.
        curl_setopt($conn[$i], CURLOPT_SSL_VERIFYPEER, 0);
        curl_setopt($conn[$i], CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($conn[$i], CURLOPT_MAXREDIRS, 1);
        curl_setopt($conn[$i], CURLOPT_TIMEOUT, $timeout);
        curl_multi_add_handle($mh, $conn[$i]);
    }

    // Kick off the transfers.
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

    // Drive the transfers until none are active.
    while ($active and $mrc == CURLM_OK) {
        // curl_multi_select() can return -1 when there is nothing to wait on
        // yet. The transfers must still be driven in that case, otherwise
        // $active never changes and this loop spins forever; back off briefly
        // instead of busy-waiting.
        if (curl_multi_select($mh) == -1) {
            usleep(100);
        }
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }

    if ($mrc != CURLM_OK) {
        print "Curl multi read error $mrc\n";
    }

    // Collect results and release every easy handle.
    $res = array();
    $e = 0;
    foreach ($urls as $i => $url) {
        if (($err = curl_error($conn[$i])) == '') {
            $res[$i] = curl_multi_getcontent($conn[$i]);
        } else {
            // Always count the failure; the counter is only reported in the
            // non-verbose summary below.
            $e++;
            if ($verbose == "yes") {
                echo "error: ".$url." (".$err.")\n";
            }
        }
        curl_multi_remove_handle($mh, $conn[$i]);
        curl_close($conn[$i]);
    }
    curl_multi_close($mh);

    if ($verbose == "no") {
        $s = count($urls) - $e;
        echo "errors ".$e." | success ".$s."\n";
    }

    return $res;
}
However there are some annoyances in curl - the main one for me being that you can’t pass variables to the write_function,
which makes it useless for updating rows etc. in a db (you can use curl_getinfo to get the URL and then do a lookup - but that is pretty backwards). This means that the crawling is not even close to being truly multithreaded, as you have to wait for all URLs to finish before working with the data.
So I thought I’d have a go at writing the raw crawler myself using fsockopen. It is not perfect, as the multi-threaded function still relies on the single-threaded one to follow any redirects.
My own single-threaded function:
{
    // Fetches one URL over a raw socket using a hand-written HTTP/1.0 GET
    // request and follows "Location:" redirects recursively.
    //
    // NOTE(review): the function signature line is missing from this listing;
    // the body expects $url, $timeout (connect timeout in seconds) and
    // $maxredirs (remaining redirects to follow) to be in scope, matching the
    // recursive call mycrawler_single($redirect, $timeout, $maxredirs).
    //
    // Returns the parse_url() array for the URL extended with 'port',
    // 'header' (raw response headers) and 'html' (response body), or null
    // when the connection cannot be opened.
    $urlinfo = parse_url($url);
    if (empty($urlinfo['scheme'])) {
        // No scheme given: assume plain HTTP and parse again.
        $urlinfo = parse_url('http://'.$url);
    }
    if (empty($urlinfo["path"])) {
        $urlinfo["path"] = "/";
    }
    if (empty($urlinfo['port'])) {
        switch ($urlinfo['scheme']) {
            case "http":
                $urlinfo['port'] = 80;
                break;
            case "https":
                $urlinfo['port'] = 443;
                break;
        }
    }

    $request = "GET ".$urlinfo["path"];
    if (isset($urlinfo["query"])) {
        $request .= "?".$urlinfo["query"];
    }
    $request .= " HTTP/1.0\r\n";
    $request .= "Host: ".$urlinfo['host']."\r\n";
    $request .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)\r\n";
    $request .= "Connection: close\r\n\r\n";

    // HTTPS needs the ssl:// transport; a plain TCP socket to port 443 would
    // send the request unencrypted and the server would reject it.
    $target = $urlinfo['host'];
    if ($urlinfo['scheme'] == "https") {
        $target = "ssl://".$target;
    }

    $fp = fsockopen($target, $urlinfo['port'], $errno, $errstr, $timeout);
    if (!$fp) {
        echo "(".$errno.")".$errstr."\n";
        return null;
    }

    fwrite($fp, $request);
    $data = '';
    while (!feof($fp)) {
        $data .= fgets($fp, 4096);
    }
    fclose($fp);

    // Split headers from body at the first blank line; a response without
    // that separator is treated as headers only.
    $tmp = explode("\r\n\r\n", $data, 2);
    $urlinfo['header'] = $tmp[0];
    $urlinfo['html'] = isset($tmp[1]) ? $tmp[1] : '';

    if ((stripos($urlinfo['header'], "location:") !== false) && ($maxredirs > 0)) {
        preg_match("/\r\nlocation:(.*)/i", $urlinfo['header'], $match);
        if ($match) {
            // NOTE(review): a relative Location value is passed on as-is and
            // is not resolved against the current host.
            $redirect = trim($match[1]);
            echo "Redirecting to ".$redirect."\n";
            $maxredirs--;
            return mycrawler_single($redirect, $timeout, $maxredirs);
        }
    }

    return $urlinfo;
}
My own multi-threaded function:
{
    // Fetches several URLs concurrently over raw non-blocking sockets using
    // hand-written HTTP/1.0 GET requests. Redirects are followed by handing
    // the new location to mycrawler_single().
    //
    // NOTE(review): the function signature line is missing from this listing;
    // the body expects $urls (0-based list of URLs), $timeout (connect timeout
    // in seconds) and $maxredirects (redirects allowed per URL) to be in scope.
    //
    // Returns an array indexed like $urls. Each entry is the parse_url() array
    // extended with 'port', 'errno', 'errstr' and, for completed requests,
    // 'header' and 'html'; a redirected entry is whatever mycrawler_single()
    // returned for the redirect target.
    //
    // NOTE(review): the read loop polls without sleeping or stream_select(),
    // so it uses a full CPU core while waiting, and there is no read timeout
    // for a server that stalls after connecting.
    $total = count($urls);
    $urlinfo = array();
    $maxredirs = array();
    $request = array();
    $fp = array();
    $data = array();
    $numdone = array();

    for ($i = 0; $i < $total; $i++) {
        $urlinfo[$i] = parse_url($urls[$i]);
        $maxredirs[$i] = $maxredirects;
        if (empty($urlinfo[$i]['scheme'])) {
            // No scheme given: assume plain HTTP and parse this URL again.
            $urlinfo[$i] = parse_url('http://'.$urls[$i]);
        }
        if (empty($urlinfo[$i]["path"])) {
            $urlinfo[$i]["path"] = "/";
        }
        if (empty($urlinfo[$i]['port'])) {
            switch ($urlinfo[$i]['scheme']) {
                case "http":
                    $urlinfo[$i]['port'] = 80;
                    break;
                case "https":
                    $urlinfo[$i]['port'] = 443;
                    break;
            }
        }

        $request[$i] = "GET ".$urlinfo[$i]["path"];
        if (isset($urlinfo[$i]["query"])) {
            $request[$i] .= "?".$urlinfo[$i]["query"];
        }
        $request[$i] .= " HTTP/1.0\r\n";
        $request[$i] .= "Host: ".$urlinfo[$i]['host']."\r\n";
        $request[$i] .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)\r\n";
        $request[$i] .= "Connection: close\r\n\r\n";

        // HTTPS needs the ssl:// transport rather than a plain TCP socket.
        $target = $urlinfo[$i]['host'];
        if ($urlinfo[$i]['scheme'] == "https") {
            $target = "ssl://".$target;
        }

        $data[$i] = '';
        $fp[$i] = fsockopen($target, $urlinfo[$i]['port'], $urlinfo[$i]['errno'], $urlinfo[$i]['errstr'], $timeout);
        if (!$fp[$i]) {
            echo "(".$urlinfo[$i]['errno'].")".$urlinfo[$i]['errstr']."\n";
            // Mark failed connections as finished so the read loop below
            // terminates and never touches an invalid handle.
            $numdone[$i] = 1;
        } else {
            stream_set_blocking($fp[$i], false);
            fwrite($fp[$i], $request[$i]);
        }
    }

    // Round-robin over the sockets until every request has finished.
    $done = (array_sum($numdone) == $total);
    while (!$done) {
        for ($i = 0; $i < $total; $i++) {
            if (!empty($numdone[$i])) {
                continue;
            }
            if (!feof($fp[$i])) {
                // Non-blocking read: yields nothing when no data is ready.
                $data[$i] .= fgets($fp[$i], 4096);
                continue;
            }

            $numdone[$i] = 1;
            $tmp = explode("\r\n\r\n", $data[$i], 2);
            $urlinfo[$i]['header'] = $tmp[0];
            $urlinfo[$i]['html'] = isset($tmp[1]) ? $tmp[1] : '';

            if ((stripos($urlinfo[$i]['header'], "location:") !== false) && ($maxredirs[$i] > 0)) {
                preg_match("/\r\nlocation:(.*)/i", $urlinfo[$i]['header'], $match);
                if ($match) {
                    $redirect = trim($match[1]);
                    echo "Redirecting to ".$redirect."\n";
                    $maxredirs[$i]--;
                    // The redirect is fetched synchronously, which blocks the
                    // other transfers until it completes.
                    $urlinfo[$i] = mycrawler_single($redirect, $timeout, $maxredirs[$i]);
                }
            }
        }
        $done = (array_sum($numdone) == $total);
    }

    for ($i = 0; $i < $total; $i++) {
        if ($fp[$i]) {
            fclose($fp[$i]);
        }
    }

    return $urlinfo;
}
All require PHP5.
I’ve been trying to do much the same thing on a site I have been working on. The problem I keep running up against is fsockopen: it waits for the connection to the server to be established before returning. As a result, when you loop through an array of URLs, it can take maybe twice as long to connect and retrieve the data as with cURL multi. The fastest solution I have seen involves using fsockopen to connect to the local machine (aka the PHP multithread HACK) and then letting each PHPlet file do its crawling independently. This cuts even the cURL multi time in half, but is not very practical in terms of server load (1 page = 30+ httpd processes running). Speed-wise, have you compared your PHP function with the cURL function?