from bs4 import BeautifulSoup as bs
import requests,json,time
# NOTE(review): removed `file = open("duanzi.json", "a")`.  The handle was
# never used anywhere in this script (save_html opens the file itself with a
# context manager) and was never closed, so the line only leaked an open
# file descriptor at import time.
def crawl_html(num):
    """Fetch one joke-listing page and return its decoded HTML text.

    Args:
        num: 1-based page number appended to the listing URL.

    Returns:
        The page body as a str, decoded as GBK (the site serves GBK).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:69.0) "
                      "Gecko/20100101 Firefox/69.0",
    }
    # timeout= keeps the crawler from hanging forever on a stalled socket;
    # the original call had no timeout at all.
    resp = requests.get(
        "http://www.lovehhy.net/Joke/Detail/QSBK/" + str(num),
        headers=headers,
        timeout=10,
    )
    # Fail loudly instead of silently parsing a 4xx/5xx error page.
    resp.raise_for_status()
    resp.encoding = "gbk"
    return resp.text
def parse_html(html):
    """Extract joke entries from one listing page.

    Args:
        html: raw HTML of a listing page (str), as returned by crawl_html.

    Returns:
        A list of dicts with keys 'title', 'time', and 'text' — one per
        joke, every value a plain string (the original code stored live
        bs4 Tag objects, so the saved file contained raw HTML reprs).
    """
    soup = bs(html, "lxml")
    container = soup.find("div", id="footzoon")
    bodies = container.find_all(id="endtext")
    # Titles are the anchor tags carrying a target= attribute.
    titles = container.find_all(target=True)
    # Timestamps are the bare text nodes directly under the container,
    # i.e. children that are not tags.  Renamed from `time`, which
    # shadowed the `time` module imported at the top of the file.
    stamps = [child for child in container.contents
              if child not in container.find_all()]
    # zip() stops at the shortest list; the original range(len(stamps))
    # loop raised IndexError whenever the lists were unequal in length.
    return [
        {
            "title": title.get_text(strip=True),
            "time": str(stamp).strip(),
            "text": body.get_text(strip=True),
        }
        for title, stamp, body in zip(titles, stamps, bodies)
    ]
def save_html(data):
    """Append one page's scraped entries to duanzi.json as a JSON line.

    Args:
        data: JSON-serializable object (the list of dicts from parse_html).

    The original code wrote Python repr() text (`str(data).encode()`) into
    a .json file, which is not parseable JSON.  json.dumps fixes that;
    ensure_ascii=False keeps the Chinese text human-readable, and each
    call appends exactly one line (JSON-lines format).
    """
    with open("duanzi.json", "a", encoding="utf-8") as fh:
        fh.write(json.dumps(data, ensure_ascii=False) + "\n")
def main():
    """Crawl listing pages 1 through 4 and persist every joke they contain."""
    for page in range(1, 5):
        raw_html = crawl_html(page)
        entries = parse_html(raw_html)
        save_html(entries)


if __name__ == "__main__":
    main()
# Source: blog post "Scraping jokes with Python" (python之爬取笑话);
# latest recommended article published 2022-12-14 09:33:59.