# 爬取小说内容 — scrape novel chapter text from jinyongwang.com

from bs4 import BeautifulSoup
import requests
import time
import lxml
# Index page of the novel (not fetched directly): http://www.jinyongwang.com/oyi/
# First chapter page; the crawl loop below substitutes consecutive ids for "1842".
url="http://www.jinyongwang.com/oyi/1842.html"
# Browser-like User-Agent so the site does not reject the scraper as a bot.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
							'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}
def get_html(url):
	"""Fetch *url* and return the decoded page text (str).

	Uses the module-level ``headers`` to look like a browser.

	Raises:
		requests.HTTPError: on a non-2xx response.
		requests.RequestException: on connection/timeout failures.
	"""
	r = requests.get(url, headers=headers, timeout=10)
	r.raise_for_status()  # fail fast instead of parsing an error page
	print(r.status_code)
	# The declared charset is often wrong on this site; use the encoding
	# detected from the body bytes so Chinese text decodes correctly.
	r.encoding = r.apparent_encoding
	# Return decoded text so the encoding fix above takes effect; the
	# original returned r.content (raw bytes), making that line dead code.
	return r.text

def get_content(url):
	"""Download one chapter page and save its text to two .txt files.

	Writes the chapter (title + all <p> contents) to a fixed file
	"yitiantulongji.txt", then copies it into a file named after the
	page title, re-wrapped at 60 characters per line.

	Args:
		url: URL of a single chapter page.
	"""
	sp = BeautifulSoup(get_html(url), 'lxml')
	title = sp.title.string

	path = "yitiantulongji.txt"
	# NOTE(review): the page title may contain characters that are
	# illegal in filenames on some platforms — confirm before reuse.
	path2 = "%s.txt" % title
	mytag = sp.find_all("p")  # chapter text lives in <p> tags
	with open(path, mode="w", encoding="utf-8") as fp:
		fp.write(title)
		for paragraph in mytag:
			for child in paragraph:
				fp.write(str(child) + '\n')

	# Re-read the flat file and re-wrap it at most 60 characters per
	# line into the title-named copy. Opened "r" (was "r+"): it is
	# only ever read here.
	with open(path, mode="r", encoding="utf-8") as fp, \
			open(path2, mode="w", encoding="utf-8") as fp1:
		chunk = fp.readline(60)
		while chunk:
			fp1.write(chunk + "\n")
			chunk = fp.readline(60)
# Crawl 120 consecutive chapter pages (ids 1842..1961), pausing between
# requests so the server is not hammered.
for i in range(120):
	chapter_url = url.replace("1842", str(1842 + i))
	print(chapter_url)

	try:
		get_content(chapter_url)
		time.sleep(5)  # be polite: throttle requests
	except Exception as e:
		# Top-level boundary: log the failure (with detail) and keep
		# crawling the remaining chapters. The original bare `except`
		# silently printed the typo "erro" with no cause.
		print("error:", e)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值