#-*- coding:utf-8 -*-
import requests
from lxml import etree
import json
class Qiushi:
    """Scrape joke posts from qiushibaike.com hot pages and save them as JSON lines."""

    def __init__(self):
        # Browser-like User-Agent so the site does not reject the scraper.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKi"
                          "t/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
        }

    def get_url_list(self):
        """Return the list of hot-page URLs, pages 1 through 13."""
        url = 'https://www.qiushibaike.com/8hr/page/{}/'
        return [url.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        """GET *url* and return the decoded response body.

        Bug fix: the original called ``requests.get(url, self.headers)``,
        which binds the headers dict to the ``params`` argument — the
        User-Agent header was never actually sent.  It must be passed as
        the ``headers`` keyword.
        """
        r = requests.get(url, headers=self.headers)
        return r.content.decode()

    def parse_content(self, resp):
        """Parse one page of HTML and return a list of post dicts.

        Each dict has keys: content, img, head_img, user_name, age, sex
        (missing fields are None).
        """
        html = etree.HTML(resp)
        div_list = html.xpath('//div[@id="content-left"]/div')
        content_list = []
        for div in div_list:
            item = {}  # renamed from ``dict`` to avoid shadowing the builtin
            content = div.xpath('.//div[@class="content"]/span/text()')
            item['content'] = [i.replace('\n', '') for i in content]
            # Evaluate each xpath once; the original ran every expression a
            # second time just for the ``len(...) > 0`` guard.
            img = div.xpath('.//div[@class="thumb"]/a/img/@src')
            item['img'] = 'https:' + img[0] if img else None
            head_img = div.xpath('.//div[@class="author clearfix"]//a/img/@src')
            item['head_img'] = 'https:' + head_img[0] if head_img else None
            user_name = div.xpath('.//div[@class="author clearfix"]/a/h2/text()')
            item['user_name'] = user_name[0].replace('\n', '') if user_name else None
            age = div.xpath('.//div[@class="author clearfix"]/div/text()')
            item['age'] = age[0] if age else None
            sex_class = div.xpath('.//div[@class="author clearfix"]/div/@class')
            if sex_class:
                # class attribute looks like "articleGender womenIcon" -> "women"
                item['sex'] = sex_class[0].split(' ')[-1].split('Icon')[0]
            else:
                item['sex'] = None
            content_list.append(item)
        return content_list

    def save_content(self, content_list):
        """Append each post as one JSON line to the output file."""
        # Explicit utf-8 so the Chinese text round-trips on any platform.
        with open('糗事百科.txt', 'a', encoding='utf-8') as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')

    def run(self):
        """Crawl every page: build URLs, fetch, parse, save."""
        # 1. build the url list
        url_list = self.get_url_list()
        # 2. request each page and get the response
        for url in url_list:
            print(url)
            resp = self.parse_url(url)
            # 3. extract the data
            content_list = self.parse_content(resp)
            # 4. save the data
            self.save_content(content_list)
            print('保存{}成功'.format(url))
if __name__ == '__main__':
    # Entry point: build the scraper and crawl all pages.
    Qiushi().run()
import requests
from lxml import etree
import json
class Qiushi:
    """Scrape joke posts from qiushibaike.com hot pages and save them as JSON lines."""

    def __init__(self):
        # Browser-like User-Agent so the site does not reject the scraper.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKi"
                          "t/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
        }

    def get_url_list(self):
        """Return the list of hot-page URLs, pages 1 through 13."""
        url = 'https://www.qiushibaike.com/8hr/page/{}/'
        return [url.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        """GET *url* and return the decoded response body.

        Bug fix: the original called ``requests.get(url, self.headers)``,
        which binds the headers dict to the ``params`` argument — the
        User-Agent header was never actually sent.  It must be passed as
        the ``headers`` keyword.
        """
        r = requests.get(url, headers=self.headers)
        return r.content.decode()

    def parse_content(self, resp):
        """Parse one page of HTML and return a list of post dicts.

        Each dict has keys: content, img, head_img, user_name, age, sex
        (missing fields are None).
        """
        html = etree.HTML(resp)
        div_list = html.xpath('//div[@id="content-left"]/div')
        content_list = []
        for div in div_list:
            item = {}  # renamed from ``dict`` to avoid shadowing the builtin
            content = div.xpath('.//div[@class="content"]/span/text()')
            item['content'] = [i.replace('\n', '') for i in content]
            # Evaluate each xpath once; the original ran every expression a
            # second time just for the ``len(...) > 0`` guard.
            img = div.xpath('.//div[@class="thumb"]/a/img/@src')
            item['img'] = 'https:' + img[0] if img else None
            head_img = div.xpath('.//div[@class="author clearfix"]//a/img/@src')
            item['head_img'] = 'https:' + head_img[0] if head_img else None
            user_name = div.xpath('.//div[@class="author clearfix"]/a/h2/text()')
            item['user_name'] = user_name[0].replace('\n', '') if user_name else None
            age = div.xpath('.//div[@class="author clearfix"]/div/text()')
            item['age'] = age[0] if age else None
            sex_class = div.xpath('.//div[@class="author clearfix"]/div/@class')
            if sex_class:
                # class attribute looks like "articleGender womenIcon" -> "women"
                item['sex'] = sex_class[0].split(' ')[-1].split('Icon')[0]
            else:
                item['sex'] = None
            content_list.append(item)
        return content_list

    def save_content(self, content_list):
        """Append each post as one JSON line to the output file."""
        # Explicit utf-8 so the Chinese text round-trips on any platform.
        with open('糗事百科.txt', 'a', encoding='utf-8') as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')

    def run(self):
        """Crawl every page: build URLs, fetch, parse, save."""
        # 1. build the url list
        url_list = self.get_url_list()
        # 2. request each page and get the response
        for url in url_list:
            print(url)
            resp = self.parse_url(url)
            # 3. extract the data
            content_list = self.parse_content(resp)
            # 4. save the data
            self.save_content(content_list)
            print('保存{}成功'.format(url))
if __name__ == '__main__':
    # Entry point: build the scraper and crawl all pages.
    Qiushi().run()