import requests
from lxml import etree
import json
import time
class QSBK(object):
    """Scraper for the qiushibaike "8hr" section.

    Fetches listing pages, extracts each post with XPath, and streams the
    results to ``qiushibaike.json`` as one JSON object per line inside a
    JSON array.
    """

    def __init__(self):
        """Set up the request headers and open the output file."""
        # Legacy IE user-agent: the site serves simpler markup to old browsers.
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        }
        # Explicit utf-8 so the Chinese post text is written correctly even on
        # platforms whose default locale encoding is not utf-8.
        self.file = open('qiushibaike.json', 'w', encoding='utf-8')
        self.file.write('[' + '\n')

    def get_content(self, url):
        """Return the HTML body of *url* as text.

        A timeout keeps the scraper from hanging forever on an unresponsive
        server; callers see ``requests.exceptions.Timeout`` in that case.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.text

    def deal_content(self, content):
        """Parse one page of HTML and append every post found to the file.

        :param content: HTML text of a listing page, as returned by
            :meth:`get_content`.
        """
        # Each post lives in a <div id="qiushi_tag_NNN"> container.
        posts = etree.HTML(content).xpath('//div[contains(@id,"qiushi_tag_")]')
        for post in posts:  # was ``list`` — shadowed the builtin
            item = {}
            text_parts = post.xpath('.//div[@class="content"]/span/text()')
            item['content'] = "".join(text_parts).strip()
            # xpath() returns a (possibly empty) list; only an IndexError from
            # a missing element is expected — anything else should surface.
            try:
                item['vote'] = post.xpath('.//span[@class="stats-vote"]/i/text()')[0]
            except IndexError:
                item['vote'] = ''
            try:
                item['comments'] = post.xpath('.//span[@class="stats-comments"]/a/i/text()')[0]
            except IndexError:
                item['comments'] = ''
            try:
                # [2:] strips the leading "//" of the protocol-relative URL.
                item['picture'] = post.xpath('.//div[@class="thumb"]/a/img/@src')[0][2:]
            except IndexError:
                item['picture'] = ''
            # ensure_ascii=False keeps the Chinese text human-readable.
            json_text = json.dumps(dict(item), ensure_ascii=False) + ',' + '\n'
            self.file.write(json_text)

    def close(self):
        """Terminate the JSON array and close the output file.

        The dummy ``{}`` entry absorbs the trailing comma left by the last
        :meth:`deal_content` write, keeping the file valid JSON.
        """
        self.file.write('{}' + '\n' + ']')
        self.file.close()
if __name__ == '__main__':
    started = time.time()
    base_url = 'https://www.qiushibaike.com/8hr/page/'
    spider = QSBK()
    # The "8hr" section of qiushibaike has exactly 13 pages in total.
    for page_no in range(1, 14):
        html = spider.get_content(base_url + str(page_no) + '/')
        spider.deal_content(html)
    spider.close()
    finished = time.time()
    print('use time is : %s 秒' % str(finished - started))
    print('func is over.')