This crawler uses BeautifulSoup with CSS selectors. The code is as follows:
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

# Fetch a URL and return the parsed page
def get_url(url):
    wb_data = requests.get(url)
    wb_data.encoding = 'gbk'  # Baidu Zhidao pages are GBK-encoded
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    return soup

# Crawl one page of search results
def get_page(url, data=None):
    soup = get_url(url)
    # Capture the href of every result link (<a class="ti">)
    for a in soup.find_all('a', class_='ti', href=True):
        URL = a['href']
        print("Found the URL:", URL)
        soup_1 = get_url(URL)
        time.sleep(0.2)
        titles = soup_1.select('span.ask-title')
        answers = soup_1.select("div[class='best-text mb-10']")
        # Extract the useful content from the fetched question page;
        # zip stops at the shorter list, so pages without a best answer are skipped
        for title, answer in zip(titles, answers):
            data = [
                title.get_text(),
                # replace() strips characters irrelevant to the topic
                answer.get_text().replace("\n", "").replace("展开全部", "").replace("\u3000\u3000", "")
            ]
            saveFile(data)
            print(data)

# Iterate over result pages; pn advances by 10 per page
def get_more_page(start, end):
    for one in range(start, end, 10):
        get_page(url + str(one))  # url is set in the __main__ block below
        time.sleep(0.1)

# Save one record to a per-keyword CSV file
def saveFile(data):
    path = 'data_1/{}.csv'.format(keyword)  # the data_1 directory must exist first
    file = open(path, 'a', encoding='utf-8')
    file.write(str(data).replace('[', ' ').replace('\'', ' ').replace(']', ' '))
    file.write('\n')
    file.close()
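As an aside, saveFile flattens the row with string replaces, which garbles titles or answers that themselves contain brackets or quotes. A minimal alternative sketch using the standard csv module (not the original code; it assumes the same two-element data list and the module-level keyword):

import csv

def saveFile(data):
    # Append one [title, answer] row; csv handles quoting and embedded commas
    path = 'data_1/{}.csv'.format(keyword)
    with open(path, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(data)

The driver section below then reads each keyword and starts the crawl: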
if __name__ == "__main__":
    with open('keyword.txt', 'r', encoding='UTF-8') as f:
        for line in f.readlines():
            keyword = line.strip()
            print(keyword)
            # Percent-encode the keyword as GBK bytes for the query string.
            # (The original built this %XX form with a repr() trick;
            # urllib.parse.quote is the standard way to get the same result.)
            word = quote(keyword.encode('gbk', 'ignore'))
            print(word)
            # The search URL to crawl; the page offset pn is appended per page
            url = 'https://zhidao.baidu.com/search?word=' + word + '&ie=gbk&site=-1&sites=0&date=0&pn='
            # 8 is the number of pages to crawl and could be made a variable
            get_more_page(0, 8 * 10)
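To see what the GBK percent-encoding step produces, here is a quick interpreter check (the keyword 电脑 is only an illustration):

>>> from urllib.parse import quote
>>> quote('电脑'.encode('gbk'))
'%B5%E7%C4%D4'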
keyword.txt is the text file that stores the search keywords, one per line.
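For illustration, a keyword.txt with two made-up entries (电脑 "computer" and 手机 "mobile phone") would look like:

电脑
手机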