Baidu seems to limit the number of searches, and the search occasionally returns the wrong result; I will keep updating this later. For now it is basically usable.
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# file:百度搜索爬取.py
# author:ytytyt
# datetime:2021/7/14 18:06
# software: PyCharm
'''
url:https://www.baidu.com/s?ie=UTF-8&wd=%E4%B8%AD%E5%9B%BD%E5%8C%96%E5%B7%A5%E8%A3%85%E5%A4%87%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8
https://www.baidu.com/s?ie=UTF-8&wd=%E9%98%BF%E5%B0%94%E5%8D%91%E6%96%AF(%E4%B8%AD%E5%9B%BD)%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8
'''
# import the modules you need
import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
import pandas as pd
import time, random
def getdata():
    """Read the company-name column from ent_name.csv and return it as a list of keywords."""
    with open('ent_name.csv', 'r', encoding='gbk') as file1:
        data = pd.read_csv(file1)
    data = data['enterpriseName'].tolist()
    return data
def getpage(url):
    """Fetch one Baidu result page; return its HTML text, or None on failure."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    try:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            res.encoding = "utf-8"
            return res.text
        else:
            print(res.status_code)
            return None
    except RequestException as e:
        print(e)
        return None
def parsePage(html):
    """Return the title link of the first matching result among the top three, or 'nodata'."""
    if html is None:
        return 'nodata'
    doc = pq(html)
    result = doc('#content_left')
    for item in result.items():
        doc2 = pq(item)
        # try the first, second and third result blocks (class "result", id 1/2/3) in turn
        res = doc2('.result#1 .t a')
        if not res:
            res = doc2('.result#2 .t a')
        if not res:
            res = doc2('.result#3 .t a')
        if not res:
            return 'nodata'
        for item2 in res.items():
            return item2.attr.href
def datasave(content):
    """Write the data to a database (not implemented yet)."""
    pass
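# --- Hypothetical sketch, not part of the original script: one way datasave could
# --- write results to a database, using the standard-library sqlite3 module.
# --- The file name 'baidu_results.db' and the one-column schema are assumptions
# --- for illustration only; this helper is never called below.
def datasave_sqlite(content):
    import sqlite3
    conn = sqlite3.connect('baidu_results.db')
    conn.execute('CREATE TABLE IF NOT EXISTS results (href TEXT)')
    conn.execute('INSERT INTO results (href) VALUES (?)', (content,))
    conn.commit()
    conn.close()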
def main(name_list):
    for name in name_list:
        url = 'https://www.baidu.com/s?ie=UTF-8&wd=' + name
        html = getpage(url)
        print(parsePage(html))
        # random delay of 0-4 seconds between requests (random.random() gives 0-1, times 4)
        ran = random.random() * 4
        time.sleep(ran)
if __name__ == '__main__':
    # Pass a keyword list here, e.g. ['aaa', 'bbb']. I read mine from a CSV file,
    # but a plain list works the same way: the program searches each keyword and
    # takes the first matching result, giving up after checking the top three
    # results. A short usage sketch follows the script.
    main(getdata())
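A minimal usage sketch, assuming the script above is run as-is and you want to pass keywords directly instead of reading them from ent_name.csv. The keyword list is taken from the example URLs in the module docstring, and the urllib.parse.quote call only shows how their percent-encoded wd= values are produced; requests encodes Chinese characters in the query string by itself, so the script works without it.

# Illustrative usage only: a literal keyword list instead of getdata().
from urllib.parse import quote

keywords = ['中国化工装备有限公司']   # any list of company names / search keywords
print(quote(keywords[0]))           # percent-encoded form seen in the wd= parameter of the docstring URL
main(keywords)                      # searches each keyword and prints the first matching result link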