PHP采集网页数据不完整问题,file_get_contents与curl获取网页数据不全的解决办法

PHP使用file_get_contents与curl直接采集网页时数据获取不完整,本文给予解决办法

问题

file_get_contents与curl直接采集网页时数据获取不完整

直接使用file_get_contents获取网页数据在大量获取时会出现不完整情况,后改进为:

/**
 * Fetch a URL with an HTTP GET request and return the response body.
 *
 * The URL is opened through the compress.zlib:// wrapper so that a
 * gzip-compressed response is inflated while it is being read.
 *
 * @param string    $collectUrl URL to fetch.
 * @param int|float $timeOut    Timeout in seconds handed to the HTTP stream context.
 *
 * @return string|false The response body, or false when the request fails.
 */
function eyz_file_get_contents($collectUrl, $timeOut = 10)
{
    $opts = array(
        'http' => array(
            'method'  => 'GET',
            'timeout' => $timeOut,
            // Advertise gzip only: the body is read through compress.zlib://,
            // so offering deflate or sdch would invite responses that come
            // back undecoded.
            'header'  => "Accept-Encoding: gzip\r\n",
        ),
    );

    // Build the stream context that carries the HTTP options above.
    $context = stream_context_create($opts);

    // Warnings are suppressed on purpose: failure is reported to the caller
    // only through the false return value.
    return @file_get_contents('compress.zlib://' . $collectUrl, false, $context);
}

通过以上代码有所改善,但依旧存在数据不全的情况

解决方式

使用以下代码即可解决

/**
 * Perform a request with cURL and return the response body.
 *
 * Recognised keys in $options (any key left out keeps its default):
 *  - follow_local    (bool)        follow redirects (CURLOPT_FOLLOWLOCATION); default true
 *  - timeout         (int)         total timeout in seconds; default 30
 *  - max_redirects   (int)         redirect limit; default 4
 *  - binary_transfer (bool)        accepted for compatibility; not read by this function
 *  - include_header  (bool)        include response headers in the returned string
 *  - no_body         (bool)        request without a body (CURLOPT_NOBODY)
 *  - cookie_location (string)      file used both as cookie jar and cookie source
 *  - useragent       (string)      User-Agent header value
 *  - post            (array|string) when non-empty the request is sent as POST with these fields
 *  - referer         (string|null) Referer header; null sends none
 *  - ssl_verifypeer  (int|bool)    CURLOPT_SSL_VERIFYPEER; default 0
 *  - ssl_verifyhost  (int)         CURLOPT_SSL_VERIFYHOST; default 0
 *  - headers         (array)       extra request headers; default array('Expect:')
 *  - auth_name       (string)      user name for CURLOPT_USERPWD; empty disables auth
 *  - auth_pass       (string)      password for CURLOPT_USERPWD
 *  - session         (string|false) cookie string sent with a fresh cookie session
 *  - encoding        (string|null) value for CURLOPT_ENCODING ('' = every encoding
 *                                  cURL supports); null leaves the option unset
 *
 * @param string   $url     URL to request.
 * @param int|null $status  Receives the value of CURLINFO_HTTP_CODE after the transfer.
 * @param array    $options Overrides for the defaults listed above.
 *
 * @return string|bool The response body, or false when cURL could not be
 *                     initialised or the transfer failed.
 */
function req_curl($url, &$status = null, $options = array())
{
    $options = array_merge(array(
        'follow_local' => true,
        'timeout' => 30,
        'max_redirects' => 4,
        'binary_transfer' => false,
        'include_header' => false,
        'no_body' => false,
        'cookie_location' => dirname(__FILE__) . '/cookie',
        // The closing parenthesis was missing, which made the header malformed.
        'useragent' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'post' => array(),
        'referer' => null,
        // NOTE(review): TLS peer/host verification is off by default. The
        // defaults are kept for backward compatibility; callers talking to
        // untrusted hosts should pass ssl_verifypeer => 1, ssl_verifyhost => 2.
        'ssl_verifypeer' => 0,
        'ssl_verifyhost' => 0,
        'headers' => array(
            // Empty Expect header; presumably meant to suppress cURL's
            // "Expect: 100-continue" on POST requests.
            'Expect:'
        ),
        'auth_name' => '',
        'auth_pass' => '',
        'session' => false,
        'encoding' => null
    ), $options);
    $options['url'] = $url;

    $s = curl_init();
    if (!$s) {
        return false;
    }

    curl_setopt($s, CURLOPT_URL, $options['url']);
    curl_setopt($s, CURLOPT_HTTPHEADER, $options['headers']);
    curl_setopt($s, CURLOPT_SSL_VERIFYPEER, $options['ssl_verifypeer']);
    curl_setopt($s, CURLOPT_SSL_VERIFYHOST, $options['ssl_verifyhost']);
    curl_setopt($s, CURLOPT_TIMEOUT, $options['timeout']);
    curl_setopt($s, CURLOPT_MAXREDIRS, $options['max_redirects']);
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($s, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($s, CURLOPT_FOLLOWLOCATION, $options['follow_local']);
    // The same file is used for writing and for reading cookies.
    curl_setopt($s, CURLOPT_COOKIEJAR, $options['cookie_location']);
    curl_setopt($s, CURLOPT_COOKIEFILE, $options['cookie_location']);

    if (!empty($options['auth_name']) && is_string($options['auth_name'])) {
        curl_setopt($s, CURLOPT_USERPWD, $options['auth_name'] . ':' . $options['auth_pass']);
    }

    if (!empty($options['post'])) {
        curl_setopt($s, CURLOPT_POST, true);
        curl_setopt($s, CURLOPT_POSTFIELDS, $options['post']);
    }

    if ($options['include_header']) {
        curl_setopt($s, CURLOPT_HEADER, true);
    }

    if ($options['no_body']) {
        curl_setopt($s, CURLOPT_NOBODY, true);
    }

    if ($options['session']) {
        curl_setopt($s, CURLOPT_COOKIESESSION, true);
        curl_setopt($s, CURLOPT_COOKIE, $options['session']);
    }

    curl_setopt($s, CURLOPT_USERAGENT, $options['useragent']);

    // Only set a Referer when the caller supplied one.
    if ($options['referer'] !== null) {
        curl_setopt($s, CURLOPT_REFERER, $options['referer']);
    }

    // Opt-in transparent decoding of compressed responses.
    if ($options['encoding'] !== null) {
        curl_setopt($s, CURLOPT_ENCODING, $options['encoding']);
    }

    $res = curl_exec($s);
    $status = curl_getinfo($s, CURLINFO_HTTP_CODE);
    curl_close($s);

    return $res;
}

以上是一个更完整的curl封装,可以比较可靠地获取页面信息。当响应数据量较大时,curl会分批接收数据,并在本地重组为完整内容后再返回

至此,问题解决

admin

发表评论

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen: