爬取百度贴吧的标题,发帖人,发帖时间
import csv
import os
import re

import requests
# The captures are non-greedy so each match stops at the FIRST closing
# delimiter; a greedy ``.*`` runs on to the last one on the same line and
# swallows neighbouring markup.
_TITLE_RE = re.compile(r'"j_th_tit ">(.*?)</a>')
_AUTHOR_RE = re.compile(r'"主题作者: (.*?)"')
_TIME_RE = re.compile(r'时间">(.*?)</span>')
_CSV_HEADER = ("标题", "发帖人", "发帖时间")


def get(url, out_path="作业.csv", html=None):
    """Extract thread titles, authors and post times and append them to a CSV.

    The page source is searched with three regular expressions (title,
    author, post time). The three result lists are paired positionally and
    appended to ``out_path`` as one row per thread.

    Args:
        url: Address of the listing page to download.
        out_path: CSV file to append to. Defaults to the file name the
            script has always used.
        html: Already-downloaded page source. When given, no HTTP request
            is made and ``url`` is not contacted; this allows saved pages
            to be re-processed.

    Raises:
        requests.RequestException: If the download fails, times out, or
            the server answers with an HTTP error status.
    """
    if html is None:
        # Send a desktop-browser User-Agent instead of the library default.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
        # A timeout keeps the script from hanging forever on a stalled
        # connection; raise_for_status() avoids parsing an error page.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()
        html = response.text

    titles = _TITLE_RE.findall(html)
    authors = _AUTHOR_RE.findall(html)
    times = _TIME_RE.findall(html)

    # The file is opened in append mode, so only emit the header row when
    # the file is new or empty; otherwise every call would repeat it.
    needs_header = (
        not os.path.exists(out_path) or os.path.getsize(out_path) == 0
    )
    with open(out_path, 'a', encoding='utf-8', newline='') as f:
        # csv.writer quotes and escapes fields containing commas or quotes.
        # "\n" keeps line endings consistent with files written previously.
        writer = csv.writer(f, lineterminator="\n")
        if needs_header:
            writer.writerow(_CSV_HEADER)
        # zip() stops at the shortest list, so a page where one pattern
        # matched fewer times no longer raises IndexError.
        writer.writerows(zip(titles, authors, times))
    print("ok")
# Listing page to scrape. The `kw` value is the percent-encoded UTF-8 form
# of "爬虫"; `pn=50` is presumably a paging offset (not verifiable from here).
url = (
    "https://tieba.baidu.com/f"
    "?kw=%E7%88%AC%E8%99%AB&ie=utf-8&pn=50"
)
get(url)
结果图