MENU

【笔记】PHP Curl 爬取资源

2021 年 03 月 18 日 • 阅读: 481 • 笔记

今天学习了一下 PHP 的爬虫 Curl,做一下笔记备用。


Curl 介绍:

curl 是一个利用 URL 语法在命令行方式下工作的开源文件传输工具,它能够从互联网上获取各种各样的网络资源。简单来说,curl 就是抓取页面的升级版。

基本模型:

pic45

基本格式:

<?php
// Minimal cURL GET: fetch a page into a string, post-process it and print it.
$curlobj = curl_init();                                          // create the cURL handle
curl_setopt($curlobj, CURLOPT_URL, "http://www.baidu.com");      // URL of the page to fetch
curl_setopt($curlobj, CURLOPT_RETURNTRANSFER, true);             // return the response instead of printing it
$output = curl_exec($curlobj);                                   // run the request; false on failure
if ($output === false) {
    // Read the error text before the handle is closed.
    echo 'Curl error: ' . curl_error($curlobj);
} else {
    echo str_replace("百度", "php", $output);
}
curl_close($curlobj);                                            // release the handle
?>

案例:(来自 PHP 中文网)

<?php
    // Example: POST a form-encoded city name to a weather web service and print the raw response.
    header("Content-Type:text/html; charset=utf-8");
    // http_build_query() percent-encodes the non-ASCII value, as
    // application/x-www-form-urlencoded bodies require.
    $data = http_build_query(['theCityName' => '北京']);
    $curlobj = curl_init();
    curl_setopt($curlobj, CURLOPT_URL, "http://www.webxml.com.cn/WebServices/WeatherWebService.asmx/getWeatherbyCityName");
    // CURLOPT_USERAGENT takes only the header value; the "user-agent:" name must not be repeated in it.
    curl_setopt($curlobj, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0");
    curl_setopt($curlobj, CURLOPT_HEADER, 0); // do not include the response headers in the output
    curl_setopt($curlobj, CURLOPT_RETURNTRANSFER, 1); // return the response as a string (false on failure) instead of printing it
    curl_setopt($curlobj, CURLOPT_POST, 1); // regular HTTP POST of the application/x-www-form-urlencoded kind used by HTML forms
    curl_setopt($curlobj, CURLOPT_POSTFIELDS, $data); // request body
    // Every entry is one complete "Name: value" header line without embedded line breaks.
    curl_setopt($curlobj, CURLOPT_HTTPHEADER, array(
        "Content-Type: application/x-www-form-urlencoded; charset=utf-8",
        "Content-Length: " . strlen($data), // strlen() counts bytes, which is what Content-Length needs
    ));
    $rtn = curl_exec($curlobj);
    if (!curl_errno($curlobj)) {
        // Uncomment to inspect the transfer details:
        // $info = curl_getinfo($curlobj);
        // print_r($info);
        echo $rtn;
    } else {
        echo 'Curl error: ' . curl_error($curlobj);
    }
    curl_close($curlobj);
?>

网友封装的万能 Curl:

<?php
/**
 * General-purpose cURL request helper driven by an options array.
 *
 * @author 教书先生
 * @link https://blog.oioweb.cn
 * @date 2020-11-12 18:00:30
 *
 * Keys recognised in $paras (all optional; a key counts as set when its value is truthy):
 *  - 'Header'    array of request header lines; replaces the default Accept/Connection set
 *  - 'ctime'     connect timeout in seconds (default 30)
 *  - 'rtime'     total transfer timeout in seconds
 *  - 'post'      POST body (string or array); switches the request to POST
 *  - 'header'    include the response headers in the returned string
 *  - 'cookie'    value for the Cookie request header
 *  - 'refer'     Referer URL; the value 1 selects a built-in default referer
 *  - 'ua'        User-Agent string (a desktop Chrome UA is used otherwise)
 *  - 'nobody'    do not download the response body
 *  - 'GetCookie' return an array with cookies, body, header block and status code
 *  - 'loadurl'   return the redirect URL reported by cURL instead of the body
 *  - 'verify'    enable TLS certificate/host verification (off by default)
 *
 * @param string $url   URL to request
 * @param array  $paras options described above
 * @return string|bool|array The curl_exec() result (string, or false on failure) by default;
 *                           with 'loadurl' the redirect URL or false when there is none;
 *                           with 'GetCookie' an array with the keys "Cookie", "body",
 *                           "header" and "code".
 */
function teacher_curl($url, $paras = [])
{
    // Returns the option when it is set to a truthy value, null otherwise. Going through
    // empty() keeps absent keys from raising "undefined array key" warnings.
    $opt = static function ($key) use ($paras) {
        return empty($paras[$key]) ? null : $paras[$key];
    };

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);

    // NOTE: verification stays disabled by default to keep existing callers working.
    // That accepts any certificate; pass 'verify' => true for anything sensitive.
    $verify = $opt('verify') !== null;
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $verify);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $verify ? 2 : 0);

    $requestHeaders = $opt('Header');
    if ($requestHeaders === null) {
        $requestHeaders = [
            "Accept:*/*",
            "Accept-Encoding:gzip,deflate,sdch",
            "Accept-Language:zh-CN,zh;q=0.8",
            "Connection:close",
        ];
    }
    curl_setopt($ch, CURLOPT_HTTPHEADER, $requestHeaders);

    // Connect timeout (seconds), 30 unless overridden.
    $connectTimeout = $opt('ctime');
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connectTimeout === null ? 30 : $connectTimeout);

    // Overall transfer timeout (seconds); cURL's default applies when not given.
    $transferTimeout = $opt('rtime');
    if ($transferTimeout !== null) {
        curl_setopt($ch, CURLOPT_TIMEOUT, $transferTimeout);
    }

    $postBody = $opt('post');
    if ($postBody !== null) {
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $postBody);
    }

    if ($opt('header') !== null) {
        curl_setopt($ch, CURLOPT_HEADER, true);
    }

    $cookie = $opt('cookie');
    if ($cookie !== null) {
        curl_setopt($ch, CURLOPT_COOKIE, $cookie);
    }

    $referer = $opt('refer');
    if ($referer !== null) {
        // The value 1 is a shorthand for the built-in default referer.
        if ($referer == 1) {
            curl_setopt($ch, CURLOPT_REFERER, 'http://m.qzone.com/infocenter?g_f=');
        } else {
            curl_setopt($ch, CURLOPT_REFERER, $referer);
        }
    }

    $userAgent = $opt('ua');
    if ($userAgent === null) {
        $userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
    }
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);

    if ($opt('nobody') !== null) {
        curl_setopt($ch, CURLOPT_NOBODY, 1);
    }

    curl_setopt($ch, CURLOPT_ENCODING, "gzip");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

    if ($opt('GetCookie') !== null) {
        // Ask for the header block in front of the body so both can be split apart below.
        curl_setopt($ch, CURLOPT_HEADER, 1);
        $result = curl_exec($ch);
        if ($result === false) {
            // Failed transfer: report empty header/body rather than slicing a boolean.
            $result = '';
        }
        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $header = (string) substr($result, 0, $headerSize); // raw response header block
        $body = (string) substr($result, $headerSize);
        // Look for cookies in the header block only, case-insensitively (HTTP/2 sends
        // lower-case header names), and also accept cookies that have no attributes and
        // therefore no trailing ';'.
        preg_match_all("/Set-Cookie: (.*?)(?:;|\r?$)/mi", $header, $matches);
        $ret = [
            "Cookie" => $matches,
            "body" => $body,
            "header" => $header,
            'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
        ];
        curl_close($ch);
        return $ret;
    }

    $ret = curl_exec($ch);
    if ($opt('loadurl') !== null) {
        // curl_getinfo() always contains 'redirect_url' (an empty string when the response
        // was not a redirect), so test for a non-empty value rather than with isset().
        $info = curl_getinfo($ch);
        $ret = !empty($info['redirect_url']) ? $info['redirect_url'] : false;
    }
    curl_close($ch);
    return $ret;
}

食用方法:

Get 访问:

// Plain GET: without options the wrapper returns whatever curl_exec() produced.
$response = teacher_curl('https://api.oioweb.cn/api/beian.php?url=qq.com');
echo $response;

Post 访问:

// Option 1: hand the POST fields over as an array.
$fields = ['url' => 'qq.com'];
echo teacher_curl('https://api.oioweb.cn/api/beian.php', ['post' => $fields]);

// Option 2: hand over an already encoded body string.
$encodedBody = 'url=qq.com';
echo teacher_curl('https://api.oioweb.cn/api/beian.php', ['post' => $encodedBody]);

携带 Cookie 访问:

// The 'cookie' option is passed to CURLOPT_COOKIE, i.e. sent as the Cookie request header.
$options = ['cookie' => 'cookie内容'];
echo teacher_curl('https://api.oioweb.cn/api/beian.php?url=qq.com', $options);

模拟访问来源 Referer:

// The 'refer' option is passed to CURLOPT_REFERER to fake the page the request came from.
$options = ['refer' => 'https://api.oioweb.cn'];
echo teacher_curl('https://api.oioweb.cn/api/beian.php?url=qq.com', $options);

模拟 UserAgent:

// Option 1: override the User-Agent through the 'ua' option.
echo teacher_curl("https://api.oioweb.cn/api/beian.php?url=qq.com",[
    'ua'=>'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
]);
// Option 2
// NOTE(review): this call does not set 'ua'; it passes a bare CURLFile as the POST body,
// so it looks like a misplaced file-upload example. The wrapper forwards 'post' to
// CURLOPT_POSTFIELDS unchanged; a bare object is presumably not a valid value there
// (the upload example wraps the file in an array) — TODO confirm the intended example.
echo teacher_curl("https://api.oioweb.cn/api/beian.php?url=qq.com",[
    'post'=>new CURLFile(realpath("Curl.jpg"))
]);

文件上传:

// File upload: wrap the local file in a CURLFile and send it as the POST field "file".
$upload = new CURLFile(realpath('Curl.jpg'));
echo teacher_curl('https://api.oioweb.cn/api/beian.php?url=qq.com', ['post' => ['file' => $upload]]);

获取 301 跳转:

// With 'loadurl' the wrapper hands back the redirect URL cURL reports for the request
// instead of the body; nothing is printed when there is no redirect.
$target = teacher_curl('https://mmbizurl.cn/s/RNHSo6Dek', ['loadurl' => 1]);
echo $target;

查看返回 Header 信息:

// 'header' switches CURLOPT_HEADER on, so the returned string includes the response headers.
$withHeaders = teacher_curl('https://api.oioweb.cn/api/beian.php?url=qq.com', ['header' => 1]);
echo $withHeaders;

设置请求头:

// Custom request headers: 'Header' is handed to CURLOPT_HTTPHEADER, which expects one
// complete header line per array element, so each header is its own string here
// (no line breaks inside a single element).
echo teacher_curl("https://api.oioweb.cn/api/beian.php?url=qq.com", [
    'Header' => [
        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding: gzip, deflate, br',
        'accept-language: zh-CN,zh;q=0.9',
        'cache-control: max-age=0',
    ],
]);

获取请求的全部信息:

// With 'GetCookie' the wrapper returns an array (keys "Cookie", "body", "header", "code"),
// so dump it with print_r(); echo would only print the word "Array".
print_r(teacher_curl("https://api.oioweb.cn/api/beian.php?url=qq.com", [
    'post' => [
        'user' => 123456,
        'pwd' => 123,
    ],
    'GetCookie' => 1,
]));

Gitee : PHP_Curl

返回文章列表 文章二维码
本页链接的二维码
打赏二维码

信阳