# 本段代码用于对key_word.txt中的搜索词进行百度搜索,获取百度搜索结果页(首页)的推广广告数。
# 为了减小偶然偏差,每个关键词爬取10次,并取均值作为最终展示的广告数量。
# 主要运用了BeautifulSoup函数库,爬取代码在request函数中。
# BeautifulSoup手册:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
import requests
from bs4 import BeautifulSoup
from itertools import repeat
# Header fields sent with every HTTP request so that the client presents
# itself as a desktop browser rather than a script.
headersParameters = {
    "Connection": "Keep-Alive",
    "Accept": "text/html, application/xhtml+xml, */*",
    "Accept-Language": "en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "User-Agent": "Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
}
# Repeat the search and average the ad count.
def develop(line, times=10):
    """Search for one keyword several times and record its average ad count.

    Relies on two module-level names: the ``request`` function, and ``data``,
    a writable text file object that must already be open when this is called.

    Prints the keyword together with the accumulated total, then appends a
    ``keyword<TAB>average`` row to ``data``.

    Args:
        line: The search keyword.
        times: How many searches to run and average over. Defaults to 10.

    Raises:
        ValueError: If ``times`` is smaller than 1.
    """
    if times < 1:
        raise ValueError("times must be at least 1")

    count = 0
    for _ in range(times):
        # request() hands back the running total with this page's ads added.
        count = request(line, count)

    # Divide by the same value that bounded the loop so the two cannot drift.
    num = count / times
    print("关键词:" + line + "\t总广告数:", count)
    data.write(line + "\t" + str(num) + "\n")
# Fetch one result page and tally its ad markers.
def request(key_word, count, timeout=10):
    """Run one Baidu search and add the ad markers found on the page to ``count``.

    Args:
        key_word: The search term, sent as the ``wd`` query parameter.
        count: Running total of ads counted so far.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``count`` plus the number of matching ``<span>`` elements in the
        response. ``count`` is returned unchanged when the request raises a
        ``requests.RequestException`` or the status code is not 200; in both
        cases a failure message is printed.
    """
    try:
        # Passing the keyword via ``params`` lets requests URL-encode it, so
        # characters such as "&", "#", "+" or spaces stay inside the query
        # value. The timeout keeps a stalled connection from hanging the run.
        httpRsp = requests.get(
            "http://www.baidu.com/s",
            params={"wd": key_word},
            headers=headersParameters,
            timeout=timeout,
        )
    except requests.RequestException as err:
        # Treated like a non-200 answer: report it and keep the batch going.
        print("数据获取失败", err)
        return count

    if httpRsp.status_code != 200:
        print("数据获取失败")
        return count

    soup = BeautifulSoup(httpRsp.text, "lxml")
    # ``"广告" in span`` tests the tag's direct children, so a span only counts
    # when one of its own text nodes is exactly "广告".
    # NOTE(review): confirm exact match (not substring) is what is intended.
    return count + sum(1 for span in soup.find_all('span') if "广告" in span)
# Script entry point: read keywords, write one averaged row per keyword.
if __name__ == "__main__":
    # ``data`` must remain a module-level name because develop() writes its
    # rows to it. The ``with`` blocks close both files even if a search raises.
    with open('data.txt', 'w+', encoding='utf-8') as data:
        with open('key_word.txt', 'r', encoding='utf-8') as file:
            data.write("关键词\t广告均数\n")
            for line in file:
                keyword = line.strip()
                # Blank lines would otherwise be searched as an empty keyword.
                if not keyword:
                    continue
                develop(keyword)
# 最后,感谢在网上冲浪分享技术的GGMM们。
# 参考教程:浅书 https://www.zxxblog.cn/article/92