PHP使用file_get_contents与curl直接采集网页时数据获取不完整,本文给出解决办法
问题
file_get_contents与curl直接采集网页时数据获取不完整
直接使用file_get_contents获取网页数据在大量获取时会出现不完整情况,后改进为:
/**
 * Fetch a URL with file_get_contents(), transparently un-gzipping the body.
 *
 * The target is opened through the compress.zlib:// stream wrapper so that a
 * gzip-compressed response is decompressed while it is being read.
 *
 * @param string    $collectUrl URL to fetch; it is appended to "compress.zlib://".
 * @param int|float $timeOut    Value for the HTTP context "timeout" option, in seconds (default 10).
 *
 * @return string|false The response body, or false when the read fails.
 */
function eyz_file_get_contents($collectUrl, $timeOut = 10)
{
    // HTTP context options: plain GET with a read timeout.
    $opts = array(
        'http' => array(
            'method'  => 'GET',
            'timeout' => $timeOut,
            // Advertise gzip only. The compress.zlib:// wrapper behaves like
            // gzopen(): it decodes gzip and hands any other payload through
            // untouched, so offering "deflate" or "sdch" invites responses
            // that would come back still encoded.
            'header'  => "Accept-Encoding: gzip\r\n",
        ),
    );

    // Create the stream context that carries the options above.
    $context = stream_context_create($opts);

    // "@" is kept to preserve the original contract: a failed read yields
    // false without emitting a warning. The second argument is the boolean
    // use_include_path flag.
    return @file_get_contents('compress.zlib://' . $collectUrl, false, $context);
}
通过以上代码有所改善,但依旧存在数据不全的情况
解决方式
使用以下代码即可解决
/**
 * Perform a request with cURL and return the response.
 *
 * Recognised keys in $options (each falls back to the default listed below):
 *  - follow_local    bool        value for CURLOPT_FOLLOWLOCATION (default true)
 *  - timeout         int         value for CURLOPT_TIMEOUT, in seconds (default 30)
 *  - max_redirects   int         value for CURLOPT_MAXREDIRS (default 4)
 *  - binary_transfer bool        accepted for compatibility; not read by this function
 *  - include_header  bool        when truthy, enables CURLOPT_HEADER
 *  - no_body         bool        when truthy, enables CURLOPT_NOBODY
 *  - cookie_location string      file used for both CURLOPT_COOKIEJAR and CURLOPT_COOKIEFILE
 *  - useragent       string      value for CURLOPT_USERAGENT
 *  - post            mixed       when non-empty, sent as CURLOPT_POSTFIELDS with CURLOPT_POST on
 *  - referer         string|null value for CURLOPT_REFERER; left unset when null
 *  - ssl_verifypeer  mixed       value for CURLOPT_SSL_VERIFYPEER (default 0)
 *  - ssl_verifyhost  mixed       value for CURLOPT_SSL_VERIFYHOST (default 0)
 *  - headers         array       value for CURLOPT_HTTPHEADER (default: a bare "Expect:" line)
 *  - auth_name       string      when a non-empty string, sent with auth_pass as CURLOPT_USERPWD
 *  - auth_pass       string      password half of CURLOPT_USERPWD
 *  - session         mixed       when truthy, enables CURLOPT_COOKIESESSION and is sent as CURLOPT_COOKIE
 *
 * @param string   $url     URL to request.
 * @param int|null $status  Receives CURLINFO_HTTP_CODE after the transfer; left untouched
 *                          when the cURL handle cannot be created.
 * @param array    $options Overrides for the defaults described above.
 *
 * @return string|bool Result of curl_exec() (CURLOPT_RETURNTRANSFER is always on),
 *                     or false when the handle cannot be created.
 */
function req_curl($url, &$status = null, $options = array())
{
    $defaults = array(
        'follow_local' => true,
        'timeout' => 30,
        'max_redirects' => 4,
        'binary_transfer' => false,
        'include_header' => false,
        'no_body' => false,
        'cookie_location' => dirname(__FILE__) . '/cookie',
        // The closing parenthesis was missing from the original literal.
        'useragent' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'post' => array(),
        'referer' => null,
        // NOTE(review): TLS peer/host verification is off by default. Kept for
        // backward compatibility; pass 'ssl_verifypeer' => 1 and
        // 'ssl_verifyhost' => 2 to turn it on.
        'ssl_verifypeer' => 0,
        'ssl_verifyhost' => 0,
        'headers' => array(
            'Expect:',
        ),
        'auth_name' => '',
        'auth_pass' => '',
        'session' => false,
    );

    // Caller-supplied values win over the defaults.
    $options = array_merge($defaults, $options);
    $options['url'] = $url;

    $s = curl_init();
    if (!$s) {
        return false;
    }

    curl_setopt($s, CURLOPT_URL, $options['url']);
    curl_setopt($s, CURLOPT_HTTPHEADER, $options['headers']);
    curl_setopt($s, CURLOPT_SSL_VERIFYPEER, $options['ssl_verifypeer']);
    curl_setopt($s, CURLOPT_SSL_VERIFYHOST, $options['ssl_verifyhost']);
    curl_setopt($s, CURLOPT_TIMEOUT, $options['timeout']);
    curl_setopt($s, CURLOPT_MAXREDIRS, $options['max_redirects']);
    // Return the response from curl_exec() instead of printing it.
    curl_setopt($s, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($s, CURLOPT_FOLLOWLOCATION, $options['follow_local']);
    // The same file is used to load cookies and to store them afterwards.
    curl_setopt($s, CURLOPT_COOKIEJAR, $options['cookie_location']);
    curl_setopt($s, CURLOPT_COOKIEFILE, $options['cookie_location']);

    if (!empty($options['auth_name']) && is_string($options['auth_name'])) {
        curl_setopt($s, CURLOPT_USERPWD, $options['auth_name'] . ':' . $options['auth_pass']);
    }

    if (!empty($options['post'])) {
        curl_setopt($s, CURLOPT_POST, true);
        curl_setopt($s, CURLOPT_POSTFIELDS, $options['post']);
    }

    if ($options['include_header']) {
        curl_setopt($s, CURLOPT_HEADER, true);
    }

    if ($options['no_body']) {
        curl_setopt($s, CURLOPT_NOBODY, true);
    }

    if ($options['session']) {
        curl_setopt($s, CURLOPT_COOKIESESSION, true);
        curl_setopt($s, CURLOPT_COOKIE, $options['session']);
    }

    curl_setopt($s, CURLOPT_USERAGENT, $options['useragent']);

    // Only set a referer when one was supplied; the default is null and
    // should not be handed to a string-valued cURL option.
    if ($options['referer'] !== null) {
        curl_setopt($s, CURLOPT_REFERER, $options['referer']);
    }

    $res = curl_exec($s);
    $status = curl_getinfo($s, CURLINFO_HTTP_CODE);
    curl_close($s);

    return $res;
}
以上是一个更完整的curl封装,可以相对较好地获取页面信息。当数据量过大时,curl会分批获取数据,并在本地进行重组
至此,问题解决
目前评论:0