from bs4 import BeautifulSoup
import requests
import time
import lxml
# Chapter index lives at http://www.jinyongwang.com/oyi/ ; this is the first
# chapter page.  The numeric id 1842 is replaced below to reach later chapters.
url="http://www.jinyongwang.com/oyi/1842.html"
# Desktop-browser User-Agent so the site serves the normal page instead of
# rejecting a bare script request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}
def get_html(url):
    """Download *url* and return the page as decoded text.

    Uses the module-level ``headers`` so the request looks like a browser.
    Raises ``requests.HTTPError`` for non-2xx responses and
    ``requests.Timeout`` if the server does not answer within 2 seconds.
    """
    r = requests.get(url, headers=headers, timeout=2)
    print(r.status_code)
    # Fail loudly on an error page instead of handing it to the parser.
    r.raise_for_status()
    # The site does not always send a correct charset header; guess the real
    # encoding from the body.
    r.encoding = r.apparent_encoding
    # Return .text, not .content: returning raw bytes would make the
    # apparent_encoding assignment above a silent no-op.
    return r.text
def get_content(url):
    """Fetch one chapter page, extract its paragraph text, and save it.

    Writes two files:
      * ``yitiantulongji.txt``  - page title followed by the raw contents of
        every ``<p>`` tag, one child node per line.
      * ``<page title>.txt``    - the same text re-wrapped at 60 characters
        per line.
    """
    sp = BeautifulSoup(get_html(url), 'lxml')
    # Guard against a missing <title>; sp.title.string would be None.
    title = sp.title.string or ""
    path = "yitiantulongji.txt"
    # Strip characters that are illegal in file names (Windows and Unix) so
    # the page title can safely be used as the second output file's name.
    safe_title = "".join(c for c in title if c not in '\\/:*?"<>|')
    path2 = "%s.txt" % safe_title
    paragraphs = sp.find_all("p")
    with open(path, mode="w", encoding="utf-8") as fp:
        fp.write(title)
        for para in paragraphs:
            for child in para:
                fp.write(str(child) + '\n')
    # Re-read what was just written and copy it out wrapped at 60 characters
    # per line: readline(60) returns at most 60 characters per call.
    with open(path, mode="r", encoding="utf-8") as fp, \
            open(path2, mode="w", encoding="utf-8") as fp1:
        chunk = fp.readline(60)
        while chunk:
            fp1.write(chunk + "\n")
            chunk = fp.readline(60)
# Crawl 120 consecutive chapter pages (ids 1842 .. 1961).
for i in range(120):
    url1 = url.replace("1842", str(1842 + i))
    print(url1)
    try:
        get_content(url1)
        # Be polite to the server: pause between requests.
        time.sleep(5)
    except Exception as e:
        # One failed chapter should not abort the whole crawl, but the
        # failure must be visible (the original bare `except: print("erro")`
        # hid which page failed and why, and also swallowed KeyboardInterrupt).
        print("error fetching %s: %s" % (url1, e))
# 爬取小说内容 (scrapes the novel's chapter contents).
# Blog footer from the original source: 最新推荐文章于 2023-05-21 21:01:00 发布
# ("latest recommended article published 2023-05-21 21:01:00").