PHP file_get_contents 和 curl 远程在线内容抓取比较
在日常 PHP 开发中，经常需要抓取远程数据，常用的方式有 file_get_contents 和 curl 两种。本文结合实际经验对二者进行分析比较，希望对大家有所帮助。
{基本用法}
> file_get_contents
// Fetch the page in a single call: $content holds the HTML as a string,
// or false when the request fails (file_get_contents() also emits a warning then).
$content = file_get_contents("http://yiilib.com");
> curl
// Step 1: create an empty cURL handle.
$ch = curl_init();
// Step 2: configure the whole request in one call.
curl_setopt_array($ch, array(
    CURLOPT_URL            => "http://yiilib.com", // page to download
    CURLOPT_RETURNTRANSFER => true,                // hand the body back as a string instead of printing it
    CURLOPT_HEADER         => false,               // leave the response headers out of the result
));
// Step 3: run the request; $output receives the page HTML (false on failure).
$output = curl_exec($ch);
// Step 4: release the handle.
curl_close($ch);
{对比/区别}
> 用时
20次抓取yiilib.com平均值
| 方法 | 用时 |
| --- | --- |
| file_get_contents | 2.1 - 2.6s |
| curl | 0.5 - 0.8s |
> 稳定性
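curl 的稳定性明显优于 file_get_contents：curl 可以分别设置连接超时与总超时，并能通过 curl_errno() / curl_error() 获取失败原因；而 file_get_contents 默认仅受 default_socket_timeout 限制，失败时只返回 false 并产生一条警告。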
{建议}
建议在项目中优先使用curl作为抓取工具
{Demo Code}
/**
 * Download the body of a remote URL with cURL.
 *
 * Redirects are followed, the connection attempt is limited to 6 seconds and
 * the whole transfer to 20 seconds.
 *
 * @param string            $url    URL to request; an empty value returns '' without any request.
 * @param array|string|null $header Extra request header lines ("Name: value"). A single
 *                                  string is accepted and sent as one header line.
 * @param bool              $https  When true, TLS peer and host verification are switched off.
 *
 * @return string|bool The response body, '' for an empty URL, or false when the
 *                     handle cannot be created or the transfer fails.
 */
public static function load($url, $header = null, $https = false)
{
    // Nothing to fetch for an empty URL.
    if (empty($url)) {
        return '';
    }

    $ch = curl_init();
    if ($ch === false) {
        // No handle could be created; report it the same way curl_exec() reports a failed transfer.
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow "Location:" redirects (not only 301)
    curl_setopt($ch, CURLOPT_HEADER, 0);         // keep response headers out of the returned body

    // Identify as a browser.
    $userAgent = 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)';
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);

    // Timeouts, in seconds.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 6); // establishing the connection
    curl_setopt($ch, CURLOPT_TIMEOUT, 20);       // the whole transfer

    // Additional request headers, if any.
    if (!empty($header)) {
        // CURLOPT_HTTPHEADER only accepts an array, so wrap a single header line.
        curl_setopt($ch, CURLOPT_HTTPHEADER, is_array($header) ? $header : array($header));
    }

    if ($https) {
        // NOTE(review): this turns OFF certificate (peer) and host-name verification,
        // which exposes the request to man-in-the-middle attacks. HTTPS URLs also work
        // with $https = false, in which case cURL's default verification stays enabled.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    }

    // Run the transfer, then release the handle before returning.
    $output = curl_exec($ch);
    curl_close($ch);

    return $output;
}
留言