# Single-threaded v2
import requests
from lxml import etree
# Scrape Qiushibaike
class QiubaiSpider:
    """Single-threaded scraper for the Qiushibaike "8hr" listing pages.

    The spider builds the listing URLs, downloads each page, extracts the
    author and text nodes of every entry and prints the extracted items.
    """

    def __init__(self, max_page=13, timeout=10):
        """Configure the spider.

        Args:
            max_page: Number of the last listing page to fetch; pages
                ``1..max_page`` are crawled. Defaults to 13, the range the
                spider has always used.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                so a stalled connection cannot block the crawl forever.
        """
        self.temp_url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 QQBrowser/4.4.108.400'
        }
        self.max_page = max_page
        self.timeout = timeout

    def getUrl_list(self):
        """Return the listing URLs for pages 1 through ``self.max_page``."""
        return [
            self.temp_url.format(page)
            for page in range(1, self.max_page + 1)
        ]

    def parse_url(self, url):
        """Download ``url`` and return the response body decoded as text.

        Raises:
            requests.exceptions.RequestException: propagated from
                ``requests.get`` (including a timeout).
        """
        resp = requests.get(url, headers=self.headers, timeout=self.timeout)
        return resp.content.decode()

    def get_content_list(self, html_str):
        """Extract one item per entry ``div`` from a listing page.

        Args:
            html_str: HTML source of a listing page.

        Returns:
            A list of dicts with the keys ``'author'`` and ``'text'``; each
            value is the list of text nodes returned by the XPath query.
            An empty list is returned for blank input or when the parser
            yields no document.
        """
        # Blank input has nothing to parse; skip the parser entirely.
        if not html_str or not html_str.strip():
            return []
        html = etree.HTML(html_str)
        if html is None:
            return []

        content_list = []
        for div in html.xpath('//div[@id="content-left"]/div'):
            content_list.append({
                'author': div.xpath('.//h2/text()'),
                'text': div.xpath('.//div[@class="content"]/span/text()'),
            })
        return content_list

    def save_content_list(self, content_list):
        """Print every extracted item to standard output."""
        for content in content_list:
            print(content)

    def run(self):
        """Crawl every listing page in order and print the extracted items."""
        for url in self.getUrl_list():
            print(url)
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            self.save_content_list(content_list)
        print('爬取完成。。。')
if __name__ == '__main__':
    # Script entry point: build the spider and crawl all listing pages.
    QiubaiSpider().run()
# author: Zheng
# time: 2018/7/11 09:02
# Scrape Qiushibaike
import requests
from lxml import etree
class Qiubai(object):
    """Scraper that appends the text of Qiushibaike "8hr" entries to a file.

    Each listing page is downloaded, the text nodes of every entry are
    extracted, and the result is appended to ``qiubai.txt`` in the current
    working directory.
    """

    def __init__(self, max_page=13, timeout=10):
        """Configure the scraper.

        Args:
            max_page: Number of the last listing page to fetch; pages
                ``1..max_page`` are crawled. Defaults to 13, the range the
                scraper has always used.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                so a stalled connection cannot block the crawl forever.
        """
        self.tem_url = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
        self.max_page = max_page
        self.timeout = timeout

    def get_url_list(self):
        """Return the listing URLs for pages 1 through ``self.max_page``."""
        return [
            self.tem_url.format(page)
            for page in range(1, self.max_page + 1)
        ]

    def parse_url(self, url):
        """Send the request for ``url`` and return the decoded response body.

        Raises:
            requests.exceptions.RequestException: propagated from
                ``requests.get`` (including a timeout).
        """
        print(url)
        resp = requests.get(url, headers=self.headers, timeout=self.timeout)
        return resp.content.decode()

    @staticmethod
    def save_content(content_list):
        """Append every entry of ``content_list`` to ``qiubai.txt``.

        Args:
            content_list: Iterable whose items are iterables of strings;
                each item is written with ``writelines`` (no separators
                are added).
        """
        # Nothing to write: do not create or touch the file.
        if not content_list:
            return
        # Open once and close deterministically; an explicit UTF-8 encoding
        # keeps non-ASCII text from depending on the platform default.
        with open('qiubai.txt', 'a', encoding='utf-8') as f:
            for text in content_list:
                f.writelines(text)

    def get_content(self, html):
        """Extract the text nodes of every entry on a listing page.

        Args:
            html: HTML source of a listing page.

        Returns:
            A list with one element per entry ``div``; each element is the
            list of text nodes returned by the XPath query. The same list
            is also stored on ``self.content_list``. Blank input or a
            parser result of ``None`` yields an empty list.
        """
        self.content_list = []
        # Blank input has nothing to parse; skip the parser entirely.
        if not html or not html.strip():
            return self.content_list
        tree = etree.HTML(html)
        if tree is None:
            return self.content_list

        self.content_list = [
            div.xpath(".//div[@class='content']/span/text()")
            for div in tree.xpath("//div[@id='content-left']/div")
        ]
        return self.content_list

    def run(self):
        """Crawl every listing page in order and append its text to the file."""
        for url in self.get_url_list():
            html = self.parse_url(url)
            content_list = self.get_content(html)
            self.save_content(content_list)
if __name__ == '__main__':
    # Script entry point: build the scraper and crawl all listing pages.
    Qiubai().run()