import bs4
from bs4 import BeautifulSoup as bs
import requests
import re
import docx
from docx.shared import Inches
from urllib.request import urlretrieve
def downpic(url, path='E:\图.jpg'):
    """Download the image at *url* to *path*.

    The default destination matches the file that gettxt() reads back when
    embedding the picture into the .docx, so existing callers are unaffected.
    """
    urlretrieve(url, path)
# Image-URL pattern used by gettxt() to pull the picture link out of an
# <a href>: "http://img...<anything-non-space>....jpg".  The dot before
# "jpg" is escaped so it matches a literal '.' rather than any character
# (the original unescaped '.' would also match e.g. "...xjpg").
r = re.compile(r'(http)://(img)[^\s]*?(\.jpg)')
def gettxt(url):
    """Scrape one walkthrough page at *url* and save its text and images
    as a Word document.

    Every <p> tag (minus the site's trailing boilerplate) is either copied
    into the document as a paragraph or, when it wraps an image link,
    downloaded via downpic() and embedded as a picture.  The document is
    saved under the first paragraph's text as its filename, so later pages
    of the same article overwrite earlier ones unless their headings differ.
    """
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    res = requests.get(url, headers=head, timeout=500)
    res.encoding = "utf-8"
    soup = bs(res.text, 'lxml')
    info = soup.find_all("p")
    doc = docx.Document()
    # The last 28 <p> tags are site boilerplate (footer/navigation) —
    # presumably; TODO confirm the count still holds for this site layout.
    for i in range(0, len(info) - 28):
        par = info[i]
        if par.string is None:
            # Mixed content: may wrap an <a> whose href points at an image.
            try:
                href = par.find("a")['href']
            except (TypeError, KeyError):
                # No usable link -> treat as a plain paragraph.  `continue`
                # so we never reuse a stale href from a previous iteration
                # (the original flat code could fall through with old `a`).
                doc.add_paragraph(par.text)
                continue
            try:
                match = r.search(href)
                if match:
                    downpic(match.group())
                    doc.add_picture("E:\图.jpg", width=Inches(5.6))
            except Exception:
                # Best-effort: a failed download/embed skips the image only.
                pass
        else:
            doc.add_paragraph(par.text)
    doc.save("E:/攻略/" + info[0].text + ".docx")
# Base URL of the multi-page walkthrough; page N lives at <BASE>N.shtml.
BASE_URL = "https://www.gamersky.com/handbook/201712/990054_"


def main():
    """Crawl walkthrough pages 2-105 and convert each one to a .docx file."""
    for page in range(2, 106):
        gettxt(BASE_URL + str(page) + ".shtml")


# Guard the crawl so importing this module does not trigger network I/O.
if __name__ == "__main__":
    main()
# Source article: 爬虫之游戏攻略 ("Web scraper for game walkthroughs")
# CSDN footer: latest recommended article published 2024-07-31 08:45:00