import requests
from lxml import etree
class tieba(object):
    """Spider that walks Baidu Tieba list pages for a keyword and
    appends each thread's title and link to ``tieba.txt``."""

    def __init__(self, keyword):
        """Store the keyword and build the first list-page URL.

        Args:
            keyword: forum keyword to search for (interpolated into the URL;
                non-ASCII is fine because requests percent-encodes it).
        """
        self.keyword = keyword
        self.url = "https://tieba.baidu.com/f?ie=utf-8&kw={}".format(keyword)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
        }
        print(self.url)
        # BUG FIX: the original also invoked self.run() here, so together
        # with the explicit run() call under __main__ the whole crawl
        # executed twice. Construction no longer starts the crawl;
        # callers invoke run() explicitly.

    def get_data(self, url):
        """Fetch *url* with the spider's headers and return the raw body bytes."""
        response = requests.get(url, headers=self.headers)
        return response.content

    def parse_list_data(self, data):
        """Parse one list page into thread records plus the next-page link.

        Args:
            data: raw page bytes as returned by get_data().

        Returns:
            Tuple ``(data_list, next_page_url)`` where *data_list* is a list
            of ``{'title', 'link'}`` dicts and *next_page_url* is the absolute
            URL of the following page, or None on the last page.
        """
        # Tieba ships the thread list inside HTML comments; strip the
        # comment markers so lxml can actually see those nodes.
        text = data.decode().replace('<!--', '').replace('-->', '')
        # Parse the cleaned markup into an element tree.
        html = etree.HTML(text)
        ele_list = html.xpath('//div[contains(@class,"threadlist_title")]/a[@rel="noopener"]')
        print(len(ele_list))
        data_list = []
        # Collect title + absolute link for every thread anchor.
        for ele in ele_list:
            data_list.append({
                'title': ele.xpath('./text()')[0],
                'link': 'https://tieba.baidu.com' + ele.xpath('./@href')[0],
            })
        # The "next page" anchor is absent on the last page, so the xpath
        # result is empty and indexing raises IndexError.
        try:
            next_page_url = 'https:' + html.xpath('//a[contains(text(),"下一页>")]/@href')[0]
        except IndexError:  # was a bare except:, which hid unrelated bugs
            next_page_url = None
        return data_list, next_page_url

    def run(self):
        """Crawl list pages until no next page remains, appending one
        tab-separated line per thread to ``tieba.txt``."""
        next_page_url = self.url
        while next_page_url:
            page_list_data = self.get_data(next_page_url)
            # Remember which page these records came from before the
            # parser overwrites next_page_url.
            current_page_url = next_page_url
            data_list, next_page_url = self.parse_list_data(page_list_data)
            print(next_page_url)
            # Open the output file once per page instead of once per record,
            # and avoid shadowing the builtin name `str`.
            with open('tieba.txt', 'a', encoding='utf8') as f:
                for item in data_list:
                    line = ('title:' + item['title'] + "\t" + 'link:' + item['link']
                            + "\t" + 'page_url:' + current_page_url + "\n")
                    f.write(line)
if __name__ == '__main__':
    # Bind the instance to a distinct name: the original rebound `tieba`
    # to the instance, shadowing the class and making it unusable for any
    # further construction in this module.
    spider = tieba("李毅")
    spider.run()