由于平时我们需要快速地在搜索引擎上搜索数据,我们可以调用诸如百度搜索引擎的搜索接口。格式如下:
https://www.baidu.com/s?wd=你的搜索内容,涉及到url链接用URL编码
可以在网站上添加 a 标签跳转到百度,进行快速搜索,这种方式平时是在浏览器上打开的。由于浏览器请求时附带了请求头信息(UA 等),这样访问不会被百度屏蔽和拦截。
如果我们希望在代码层面获取相关的搜索内容,怎么办呢?以PHP代码为例,通过添加部分必要请求头,绕过百度检测。
经过分析,必要请求头如下:
User-Agent 浏览器及操作系统标识字段
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
Accept 浏览器可接收的响应数据格式及其优先级
text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Language 接收的语言类型
zh-CN,zh;q=0.9
不强制登录状态请求,部分操作需要登录可以附带cookie信息
PHP代码封装成爬虫类
/**
 * Small cURL-based HTTP fetcher that sends browser-like request headers and
 * caches the cookies a site hands out to anonymous visitors.
 *
 * Subclasses set $url and the cookie cache paths for a concrete search engine.
 */
class Spider {
    // Cache file for cookies returned to requests that are not logged in
    protected $cookie_nologin_path = './cookie_nologin.txt';
    // Cache file for cookies of a logged-in session
    protected $cookie_login_path = './cookie_login.txt';
    // API credentials; filled in by subclasses that need them
    protected $appid;
    protected $appkey;
    protected $token;
    // Lifetime of the cached cookie, in seconds
    protected $expired = 3600;
    // Base address of the search engine
    protected $url = '';

    public function __construct() {
    }

    /**
     * Request headers sent with every search request.
     *
     * @return array list of "Name: value" header lines
     */
    public function getHeaders() {
        return [
            'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Language: zh-CN,zh;q=0.9',
        ];
    }

    /**
     * Return the anonymous-visitor cookies for the site that hosts $url.
     *
     * A cached value younger than $expired seconds is reused unless $flush is
     * true. Otherwise the site root (scheme://host) is requested, the cookies
     * from its Set-Cookie response headers are collected, cached and returned.
     *
     * @param string $url   any URL on the target site
     * @param bool   $flush true to ignore the cache and fetch fresh cookies
     * @return string cookies formatted as "name=value; name2=value2", or ''
     *                when the URL has no scheme/host or no cookie was received
     */
    public function getCookie($url, $flush = false) {
        $cacheFile = $this->cookie_nologin_path;
        // Check existence first: filemtime() warns on a missing file.
        if (!$flush && is_file($cacheFile)) {
            $lasttime = filemtime($cacheFile);
            if ($lasttime > 0 && time() - $lasttime < $this->expired) {
                $cached = file_get_contents($cacheFile);
                if ($cached !== false) {
                    return $cached;
                }
            }
        }

        $info = parse_url($url);
        if (!is_array($info) || empty($info['scheme']) || empty($info['host'])) {
            return '';
        }
        $origin = $info['scheme'].'://'.$info['host'];

        $data = $this->curlGet($origin, [
            'showheader'   => 1,
            'headers'      => $this->getHeaders(),
            'returnheader' => 1,
        ]);
        $cookie = $this->parseSetCookie(is_string($data) ? $data : '');

        if ($cookie !== '') {
            // Cache the cookie for later requests
            file_put_contents($cacheFile, $cookie);
        }
        return $cookie;
    }

    /**
     * Extract the cookie pairs from a raw response-header block.
     *
     * Only the leading "name=value" of each Set-Cookie line is kept; attributes
     * such as path, domain and expires are response-only and must not be sent
     * back. A later cookie with the same name replaces the earlier one.
     *
     * @param string $rawHeaders response headers separated by CRLF or LF
     * @return string pairs joined with "; ", or '' when none were found
     */
    protected function parseSetCookie($rawHeaders) {
        $pairs = [];
        foreach (preg_split('/\r?\n/', (string)$rawHeaders) as $line) {
            // Header names are case-insensitive (HTTP/2 sends them lowercase).
            if (!preg_match('/^Set-Cookie:\s*([^=;\s][^=;]*)=([^;]*)/i', $line, $m)) {
                continue;
            }
            $name = trim($m[1]);
            $pairs[$name] = $name.'='.trim($m[2]);
        }
        return implode('; ', $pairs);
    }

    /**
     * Perform a GET request with cURL.
     *
     * Recognised $options keys (all optional):
     *   - showheader:   truthy to include the response headers in the result
     *   - returnheader: truthy to return only the response headers
     *   - cookie:       string sent as the Cookie request header
     *   - headers:      list of extra "Name: value" request header lines
     *   - verifyssl:    truthy to verify the TLS certificate and host name
     *
     * NOTE(review): TLS verification is off unless 'verifyssl' is passed, which
     * matches the previous behaviour but leaves HTTPS requests open to
     * interception. Enable it wherever a CA bundle is available.
     *
     * @param string $url
     * @param array  $options
     * @return string|false response text, or false when the request failed
     */
    public function curlGet($url, $options = []) {
        $options = (array)$options;
        $headerOnly = !empty($options['returnheader']);
        // Header-only output is cut out of the response by header size, so the
        // headers have to be present in the response whenever it is requested.
        $includeHeader = $headerOnly || !empty($options['showheader']);
        $verifySsl = !empty($options['verifyssl']);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, $includeHeader ? 1 : 0);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $verifySsl ? 1 : 0);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $verifySsl ? 2 : 0);
        if (!empty($options['cookie'])) {
            curl_setopt($ch, CURLOPT_COOKIE, $options['cookie']);
        }
        if (!empty($options['headers'])) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, (array)$options['headers']);
        }

        $result = curl_exec($ch);
        if ($result !== false && $headerOnly) {
            // Keep only the header part of the response
            $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
            $result = substr($result, 0, $headerSize);
        }
        curl_close($ch);
        return $result;
    }
}
/**
 * Spider configured for Baidu: checks whether a page is indexed, reads the
 * number of indexed pages of a site, and submits URLs through the Baidu
 * link-submission API.
 */
class BaiduSpider extends Spider {
    public function __construct() {
        parent::__construct();
        $this->url = 'https://www.baidu.com';
        $this->cookie_nologin_path = './baidu_cookie_nologin.txt';
        // Cache file for cookies of a logged-in session
        $this->cookie_login_path = './baidu_cookie_login.txt';
    }

    /**
     * Run a Baidu search and return the raw result page.
     *
     * @param string $keyword search text; URL-encoded here
     * @return string|false whatever curlGet() returns for the search URL
     */
    protected function search($keyword) {
        $searchUrl = "{$this->url}/s?wd=".urlencode($keyword);
        return $this->curlGet($searchUrl, [
            'headers' => $this->getHeaders(),
            'cookie'  => $this->getCookie($searchUrl),
        ]);
    }

    /**
     * Check whether the given URL appears in Baidu's search results.
     *
     * @param string $url page address to look up
     * @return bool true when the result page contains a result container
     */
    public function isCollectPage($url) {
        $data = $this->search($url);
        if (!$data) {
            return false;
        }
        return strpos($data, "result c-container") !== false;
    }

    /**
     * Get the number of pages Baidu reports for the domain of $url, using a
     * "site:<host>" search.
     *
     * @param string $url full URL, or a bare host name without a scheme
     * @return int|string the count as a digit string with grouping commas
     *                    removed, or 0 when the host cannot be determined, the
     *                    request fails, or no count is found on the page
     */
    public function getCollectNum($url) {
        $host = parse_url($url, PHP_URL_HOST);
        if (!$host) {
            // A bare "example.com" has no host component until a scheme is added.
            $host = parse_url('http://'.$url, PHP_URL_HOST);
        }
        if (!$host) {
            return 0;
        }

        $data = $this->search("site:".$host);
        if (!$data) {
            return 0;
        }
        // Counts may be written with thousands separators, e.g. "1,234".
        if (preg_match("/找到相关结果数约([\d,]+)个/", $data, $match)) {
            return str_replace(",", "", $match[1]);
        }
        if (preg_match("/该网站共有([\s\S]+?)个网页被百度收录/", $data, $match)) {
            return trim(str_replace(",", "", strip_tags($match[1])));
        }
        return 0;
    }

    /**
     * Store the API credentials used by later calls.
     *
     * @param array $params optional keys 'appid', 'appkey', 'token'; a missing
     *                      or empty entry leaves the current value untouched
     */
    public function initApi($params) {
        $params = (array)$params;
        foreach (['appid', 'appkey', 'token'] as $key) {
            if (!empty($params[$key])) {
                $this->$key = $params[$key];
            }
        }
    }

    /**
     * Submit URLs to the Baidu link-submission API.
     *
     * The site domain is taken from the host of the first URL, and the token
     * is the one stored by initApi() (issued by the Baidu webmaster platform,
     * https://ziyuan.baidu.com/).
     *
     * @param array|string $urls complete links including http:// or https://
     * @return string|false raw API response, or false when no URL was given,
     *                      the first URL has no host, or the request failed
     */
    public function submitWeb($urls=[]) {
        if (!is_array($urls)) {
            $urls = [$urls];
        }
        $urls = array_values(array_filter($urls));
        if (!$urls) {
            return false;
        }
        $domain = parse_url($urls[0], PHP_URL_HOST);
        if (!$domain) {
            return false;
        }

        // Build "site=<domain>&token=<token>" with proper encoding; the '&'
        // separator is explicit so arg_separator.output cannot change it.
        $query = http_build_query(['site' => $domain, 'token' => (string)$this->token], '', '&');
        $api = 'http://data.zz.baidu.com/urls?'.$query;

        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $api,
            CURLOPT_POST => true,
            CURLOPT_RETURNTRANSFER => true,
            // One URL per line, as plain text
            CURLOPT_POSTFIELDS => implode("\n", $urls),
            CURLOPT_HTTPHEADER => ['Content-Type: text/plain'],
        ]);
        $result = curl_exec($ch);
        curl_close($ch);
        return $result;
    }
}
使用
// Ask Baidu how many pages of the article's domain are indexed.
$pageUrl = 'https://blog.nango.top/admin/article/index.html';
$spider = new BaiduSpider();
$data = $spider->getCollectNum($pageUrl);
如果觉得麻烦可以使用现成的API接口 功能丰富