爬虫爬取网站:https://www.dygod.net/html/gndy/dyzz/
源码:
# Scraper for https://www.dygod.net/html/gndy/dyzz/ (latest feature films).
# For each movie listed it extracts the title, translated name, year, region,
# genre and magnet link, prints them, and writes one row per complete movie
# into an Excel workbook.
import requests
import re
import xlwt
from bs4 import BeautifulSoup

url = "https://www.dygod.net/html/gndy/dyzz/"
hd = {
    # Pretend to be a desktop Chrome browser so the site serves normal pages.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}


def getmanget(linkurl):
    """Fetch a movie detail page and return its magnet link text, or None."""
    res = requests.get(linkurl, headers=hd)
    # Detect the real page encoding instead of assuming UTF-8 (site pages
    # are typically GBK-encoded).
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, "html.parser")
    for a in soup.find_all("a"):
        # The download anchor is the one whose text contains "magnet".
        if "magnet" in str(a.string):
            return a.string
    return None


def insertDB():
    """Placeholder for a future database sink; intentionally a no-op."""
    pass


def saveExcel(worksheet, count, lst):
    """Write the six fields of *lst* into row *count* of *worksheet*."""
    for i in range(6):
        worksheet.write(count, i, lst[i])


count = 0
total = []
workbook = xlwt.Workbook(encoding="utf-8")
# BUG FIX: the original rebound `workbook` to the sheet, losing the workbook
# object entirely; keep them separate so the workbook can be saved later.
worksheet = workbook.add_sheet('sheet1')

for i in range(2, 3):
    url = "https://www.dygod.net/html/gndy/dyzz/index_" + str(i) + ".html"
    res = requests.get(url, headers=hd)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, "html.parser")
    # Each movie entry is a <table class="tbspan" style="margin-top:6px">.
    for x in soup.find_all(class_="tbspan", style="margin-top:6px"):
        info = []
        info.append(x.find("a").string)  # movie title

        # Each labelled field sits on its own line inside the blurb text.
        # (The inner match variable is no longer named `ret`, which the
        # original reused and shadowed the outer find_all result with.)
        for m in re.findall(r"译 名(.*)\n", str(x)):
            m = m.replace(u'\u3000', u'')  # strip full-width padding spaces
            print("译 名:", m)
            info.append(str(m).split("/")[0])
        for m in re.findall(r"年 代(.*)\n", str(x)):
            m = m.replace(u'\u3000', u'')
            print("年 代:", m)
            info.append(str(m))
        for m in re.findall(r"产 地(.*)\n", str(x)):
            m = m.replace(u'\u3000', u'')
            # BUG FIX: the original printed the regex pattern itself here
            # instead of the extracted value.
            print("产 地:", m)
            info.append(str(m).split("/")[0])
        for m in re.findall(r"类 别(.*)\n", str(x)):
            m = m.replace(u'\u3000', u'')
            print("类 别:", str(m).split("/")[0])
            info.append(str(m).split("/")[0])

        linkurl = "https://www.dygod.net/" + x.find("a").get("href")
        magnet = getmanget(linkurl)
        if magnet:
            print("下载地址:", magnet)
            info.append(str(magnet))

        # BUG FIX: the original built `info` but never stored it anywhere.
        # Persist only complete rows (all six fields were found).
        if len(info) == 6:
            saveExcel(worksheet, count, info)
            total.append(info)
            count += 1

# BUG FIX: the workbook was never written to disk in the original.
workbook.save("movies.xls")
爬取结果: