问题描述: 2021年7月21日 16:05:57,试着爬取了糗事百科第一页的数据(1: 标题; 2: 笑的人数; 3: 评论人数; 4: 作者)
# -*- coding=utf-8 -*-
import urllib.request
from lxml import etree
import json
import xlwt
class EmbarrassmentEncyclopedia(object):
    """Scrape one Qiushibaike listing page and export the entries.

    For every recommended article the scraper collects four values:
    the title, the number of laughs, the number of comments and the
    author name.  The collected records are handed to ``deal_json``.
    """

    def __init__(self, url):
        """Remember the page to fetch and the request headers to send.

        Args:
            url: Address of the listing page to download.
        """
        self.url = url
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
            "Cookie": '_qqq_uuid_="2|1:0|10:1626351094|10:_qqq_uuid_|56:ZGI3N2NkN2JhOWIwNGY3YTYwYmFjYWVlMGFkNmQ0Y2FkNmFiZTYyYg==|2646ac3ceb6bd51c14ab695b46705339eb888581145dbdb8d49b67d2bd162877";'
        }

    @staticmethod
    def _first_text(nodes, default=""):
        """Return the first entry of an XPath text result, or *default* if empty."""
        return nodes[0] if nodes else default

    # Goal: collect 1: title; 2: laughs; 3: comment count; 4: author
    def get_html(self):
        """Download ``self.url``, locate the article entries and process them.

        Any error (bad URL, network failure, parse failure) is printed to
        stdout rather than raised, so the caller never sees an exception.
        """
        try:
            # Use the URL passed to the constructor; relying on a
            # module-level ``url`` breaks as soon as the class is imported
            # or instantiated with a different address.
            req = urllib.request.Request(self.url, headers=self.header)
            # The context manager closes the HTTP response after reading.
            with urllib.request.urlopen(req) as response:
                resp = response.read().decode("utf-8")
            html = etree.HTML(resp)
            print("返回类型是: " + str(type(html)))
            result = html.xpath('//div[@class="recommend-article"]/ul/li[contains(@class, "item")]')
            print(str(len(result)))
            if len(result) > 0:
                self.get_attributes(result)
            else:
                print("html.xpath 匹配到的内容为空")
        except Exception as e:
            # Best-effort script boundary: report the problem and carry on.
            print(e)

    # Split the matched elements into individual records
    def get_attributes(self, ele):
        """Build one record per matched element and pass them to ``deal_json``.

        Args:
            ele: Non-empty list of matched ``<li>`` elements; each must
                support ``.xpath()`` and the first must expose ``.attrib``.
        """
        try:
            print(type(ele))  # type of the match result
            print(ele[0].attrib)  # attributes of the first matched element
            items = []
            for attrib in ele:
                title_nodes = attrib.xpath('.//a[contains(@class, "recmd-content")]/text()')
                # The two counters may legitimately be missing from an entry.
                laughter_nodes = attrib.xpath('.//div[contains(@class, "recmd-num")]/span[1]/text()')
                comment_nodes = attrib.xpath('.//div[contains(@class, "recmd-num")]/span[4]/text()')
                author_nodes = attrib.xpath('.//span[contains(@class, "recmd-name")]/text()')
                item = {
                    # Fall back to an empty string so that one entry without
                    # a title/author does not discard the whole page.
                    "title": self._first_text(title_nodes),
                    "laughter_nmb": judge_zero(laughter_nodes),
                    "comments_nmb": judge_zero(comment_nodes),
                    "auth": self._first_text(author_nodes)
                }
                items.append(item)
                print("添加数据" + str(item) + "完成... ...")
            deal_json(items)
        except Exception as e:
            print(e)

    def get_started(self):
        """Announce the run on stdout and start the download."""
        print("开始访问目标网站:")
        self.get_html()
# Fall back to 0 when nothing was matched
def judge_zero(nmb):
    """Convert the first entry of *nmb* to ``int``; return 0 when it is empty.

    Args:
        nmb: Sized, indexable collection whose first entry is accepted
            by ``int()`` (e.g. the text result of an XPath query).

    Returns:
        ``int(nmb[0])`` if *nmb* has at least one entry, otherwise ``0``.
    """
    entry_count = len(nmb)
    if entry_count != 0:
        return int(nmb[0])
    return 0
# Handle the JSON data
def deal_json(json_data):
    """Print the records, round-trip them through JSON and write the sheet.

    Args:
        json_data: The collected records (a list of dicts per the caller).
    """
    print(json_data)
    print(type(json_data))  # json_data arrives as a list

    # json.dumps turns the list into a str; json.loads parses that str
    # back into plain Python objects (again a list here).
    serialized = json.dumps(json_data)
    print(type(serialized))
    restored = json.loads(serialized)

    write_xls(restored)
# Write the records to an .xls workbook
def write_xls(json_data, path="./08Qiushi.xls"):
    """Write the scraped records to an Excel workbook.

    Row 0 holds the column headers; every record occupies one row below.

    Args:
        json_data: Sequence of dicts with the keys ``"title"``,
            ``"laughter_nmb"``, ``"comments_nmb"`` and ``"auth"``.
        path: Destination file.  xlwt emits the legacy binary ``.xls``
            format, so the name must not end in ``.xlsx`` — Excel rejects
            such a file because content and extension disagree.
    """
    xls_workbook = xlwt.Workbook()
    xls_worksheet = xls_workbook.add_sheet("糗事百科第一页的简单数据")
    json_data_len = len(json_data)

    titles = ['标题', '笑的人数', '评论人数', '作者']
    # Record keys in the same order as the header columns.
    keys = ("title", "laughter_nmb", "comments_nmb", "auth")

    xls_style = set_xls_style(xls_worksheet, len(titles), json_data_len)
    for col, title in enumerate(titles):
        xls_worksheet.write(0, col, title, xls_style)

    xls_style_align_center = set_xls_center(xls_worksheet)
    for row, record in enumerate(json_data, start=1):
        for col, key in enumerate(keys):
            xls_worksheet.write(row, col, record[key], xls_style_align_center)
        print("数据写入了 " + str(row) + "条了")

    xls_workbook.save(path)
def set_xls_style(sheet, col_nmb, row):
    """Size the sheet's columns and rows and return the header cell style.

    Args:
        sheet: xlwt worksheet to adjust.
        col_nmb: Columns with index 0..col_nmb (inclusive) get a fixed width.
        row: Rows with index 0..row (inclusive) get a row style.

    Returns:
        An ``xlwt.XFStyle`` with a bold '仿宋' font of height 240 and
        content centred horizontally and vertically.
    """
    centered = xlwt.Alignment()
    centered.horz = xlwt.Alignment.HORZ_CENTER
    centered.vert = xlwt.Alignment.VERT_CENTER

    header_font = xlwt.Font()
    header_font.name = '仿宋'
    header_font.bold = True
    header_font.height = 240

    header_style = xlwt.XFStyle()
    header_style.font = header_font
    header_style.alignment = centered

    for column_index in range(0, col_nmb + 1):
        column = sheet.col(column_index)
        column.width = 256 * 20

    # Give every row the regular style first, then override the header row
    # with a taller one (the author uses font-height styles to set row height).
    regular_row_style = xlwt.easyxf('font:height 300')
    for row_index in range(0, row + 1):
        sheet.row(row_index).set_style(regular_row_style)
    header_row_style = xlwt.easyxf('font:height 480')
    sheet.row(0).set_style(header_row_style)

    return header_style
def set_xls_center(worksheet):
    """Build a cell style that centres content on both axes.

    Args:
        worksheet: Unused by this function.

    Returns:
        An ``xlwt.XFStyle`` whose alignment is centred horizontally and
        vertically.
    """
    both_axes_centered = xlwt.Alignment()
    both_axes_centered.vert = xlwt.Alignment.VERT_CENTER
    both_axes_centered.horz = xlwt.Alignment.HORZ_CENTER

    style = xlwt.XFStyle()
    style.alignment = both_axes_centered
    return style
if __name__ == "__main__":
    # Script entry point: scrape the first page of the 8hr feed.
    # ``url`` is deliberately bound as a module-level name here.
    url = "https://www.qiushibaike.com/8hr/page/1/"
    EmbarrassmentEncyclopedia(url).get_started()
问题解决: