A small crawler example: scraping jokes with XPath
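
The script below downloads the joke list pages from www.jokeji.cn, follows the link to each joke, and pulls the joke text out of the detail page with XPath; the full script follows after this warm-up. As a warm-up, here is a minimal, self-contained sketch of the lxml pattern the script relies on; the HTML snippet is made up purely for illustration and only mimics the shape of the real site's markup:

from lxml import etree

# Hypothetical list-page fragment, shaped like the real site's markup
snippet = '''
<div class="list_title">
  <ul>
    <li><b><a href="/jokehtml/a/1.htm">Joke one</a></b></li>
    <li><b><a href="/jokehtml/a/2.htm">Joke two</a></b></li>
  </ul>
</div>
'''

tree = etree.HTML(snippet)  # parse the fragment into an element tree
# An XPath ending in @href returns the attribute values as plain strings
hrefs = tree.xpath('//div[@class="list_title"]/ul/li/b/a/@href')
print(hrefs)  # ['/jokehtml/a/1.htm', '/jokehtml/a/2.htm']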

 

import requests
from lxml import etree


# Append one piece of joke text to the output file
def write_file(art):
	with open("笑话.txt", "a+", encoding="utf-8") as f:
		f.write(art + "\n")  # one paragraph per line


# Parse a list page, follow each joke link, and extract the joke text
def parse_html(html):
	content = etree.HTML(html)
	# Relative links to the individual joke pages
	a_lists = content.xpath('//div[@class="list_title"]/ul/li/b/a/@href')
	for a in a_lists:
		# Example detail-page URL:
		# "http://www.jokeji.cn/jokehtml/%E5%86%B7%E7%AC%91%E8%AF%9D/201806212319307.htm"
		url = "http://www.jokeji.cn" + a
		result = requests.get(url)
		# The detail pages are served as gb2312, so decode them with that encoding
		result.encoding = "gb2312"
		info = etree.HTML(result.text)
		art_lists = info.xpath('//span[@id="text110"]/p/text()')
		for art in art_lists:
			print(art)
			write_file(art)


def main():
	headers = {
		"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
	}
	# Crawl the first 10 list pages: list_1.htm ... list_10.htm
	for num in range(1, 11):
		url = "http://www.jokeji.cn/list_" + str(num) + ".htm"
		html = requests.get(url, headers=headers)
		# If you are not sure of the site's encoding, print(html.encoding) to check it
		html.encoding = "gb2312"
		parse_html(html.text)


if __name__ == '__main__':
	main()
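
Both requests in the script hard-code the "gb2312" encoding. If you are not sure what a site uses, requests can report both the encoding declared by the server and the one detected from the page body; a minimal sketch, using the same list-page URL as above:

import requests

resp = requests.get("http://www.jokeji.cn/list_1.htm",
                    headers={"User-Agent": "Mozilla/5.0"})
print(resp.encoding)            # encoding declared by the server (may be missing or wrong)
print(resp.apparent_encoding)   # encoding guessed from the response body
resp.encoding = resp.apparent_encoding  # decode the text with the detected encoding
print(resp.text[:200])          # first 200 characters of the decoded page

For this site the detected encoding should come back as GB2312 (or the compatible GBK/GB18030), which is why the script simply hard-codes it.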

 
