import json

import requests
from lxml import etree
class TiebaSpider:
    """Download one Tieba listing page and save the links found on it.

    The spider requests the mobile listing URL built from ``tieba_name``,
    collects the text and ``href`` of the ``<a>`` children of every ``<div>``
    whose class attribute contains ``"i"``, and appends the result to
    ``<tieba_name>.txt``.
    """

    # Seconds to wait for the server. Without a timeout ``requests.get`` can
    # block forever on an unresponsive host.
    REQUEST_TIMEOUT = 10

    def __init__(self, tieba_name):
        """Store the forum name and prepare the request URL and headers.

        Args:
            tieba_name: Forum name; used as the ``kw`` query value and as the
                stem of the output file name.
        """
        self.tieba_name = tieba_name
        self.url = "http://tieba.baidu.com/mo/q----,sz@320_240-1-3---/m?kw="+tieba_name+"&lp=7202"
        # A mobile browser User-Agent is sent with every request.
        self.headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36"}

    def parse_url(self, url):
        """Send a GET request to ``url`` and return the body as text.

        Args:
            url: Address to fetch.

        Returns:
            The response body decoded with ``bytes.decode()`` (UTF-8).

        Raises:
            requests.exceptions.RequestException: On connection failure or
                when no response arrives within ``REQUEST_TIMEOUT`` seconds.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        return response.content.decode()

    def get_content_list(self, html_str):
        """Extract link titles and hrefs from the listing page markup.

        Args:
            html_str: HTML text of the listing page.

        Returns:
            A list with one dict per matched ``<div>``. Each dict has the keys
            ``"title"`` and ``"href"``; a value is the list of matches for
            that div, or ``None`` when the div has no matching ``<a>`` child.
        """
        html = etree.HTML(html_str)
        href_list = []
        for div in html.xpath("//div[contains(@class,'i')]"):
            # Evaluate each XPath once and reuse the result.
            titles = div.xpath("./a/text()")
            hrefs = div.xpath("./a/@href")
            href_list.append({
                "title": titles if titles else None,
                "href": hrefs if hrefs else None,
            })
        return href_list

    def save_content(self, href_list):
        """Append the extracted items to ``<tieba_name>.txt``.

        Each item is written as one JSON object per line. ``file.write`` only
        accepts strings, so the items are serialized first; passing the list
        itself raises ``TypeError``.

        Args:
            href_list: Items as returned by ``get_content_list``.
        """
        file_name = self.tieba_name + ".txt"
        # Explicit UTF-8 so non-ASCII titles are written the same way on every
        # platform; ensure_ascii=False keeps them human-readable in the file.
        with open(file_name, "a", encoding="utf-8") as f:
            f.writelines(
                json.dumps(item, ensure_ascii=False) + "\n" for item in href_list
            )

    def run(self):
        """Fetch the listing page, parse it, and save the extracted links."""
        # 1. The start URL was built in __init__.
        # 2. Send the request and get the page content.
        html_str = self.parse_url(self.url)
        # 3. Parse the content.
        href_list = self.get_content_list(html_str)
        # 4. Save the content.
        self.save_content(href_list)
# Script entry point: crawl the "做头发" forum when executed directly.
if __name__ == "__main__":
    TiebaSpider("做头发").run()
# Work in progress…