# -*- coding: utf-8 -*-
# @Time   : 2020/11/26 16:09
# @Author : QQ group 559698073
# @File   : 服务器玩法.py
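# Scrapes article links and titles from the 9i0i.com tag listing pages,
# then saves each article's post body as a local HTML file.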
from urllib import request
import urllib.error
from bs4 import BeautifulSoup
import re
def main():
url = "https://www.9i0i.com/misc.php?mod=tag&id=1663&type=thread&page="
link,title = get_title_link(url)
title_content(link,title)
def get_title_link(url):  # Collect article titles and their links from the tag listing pages.
    findlink = re.compile(r'<a href="(.*?)" target="_blank">')
    findtitle = re.compile(r'<a.*target="_blank">(.*?)</a>')
    linklist = []
    titlelist = []
    for page in range(1, 4):  # Pages 1-3 of the listing.
        linkurl = url + str(page)
        html = askhtml(linkurl)
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.findAll('div', class_='bm_c'):
            item = str(item)
            link = re.findall(findlink, item)
            for i in range(len(link)):
                link[i] = "https://www.9i0i.com/" + link[i]  # Links in the listing are relative.
            title = re.findall(findtitle, item)
            linklist = linklist + link
            titlelist = titlelist + title
    return linklist, titlelist  # Collected article links and titles.
def title_content(linklist, titlelist):  # Fetch each article page and save its post body (td.t_f) to disk.
    print("Crawling...")
    for i in range(len(linklist)):
        cont = askhtml(linklist[i])
        soup = BeautifulSoup(cont, "html.parser")
        print("Crawling item " + str(i + 1))
        # The output directory F:\fwq must already exist; the article title is used as the file name.
        with open('F:\\fwq\\' + titlelist[i] + ".html", 'w', encoding='utf-8') as fp:
            fp.write(str(soup.findAll("td", class_="t_f")))
def askhtml(url):  # Fetch the HTML of a single URL.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
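    # A browser-like User-Agent is sent so the site is less likely to reject the request as a bot.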
    req = request.Request(url, headers=headers)
    html = ''
    try:
        resp = request.urlopen(req)
        html = resp.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
if __name__ == '__main__':
    main()
# The first program that runs perfectly, worth commemorating!