Python 基础阶段项目练习:
1、写一个网络爬虫程序
2、爬取目标网站数据,关键项不能少于5项。
3、存储数据到数据库,可以进行增删改查操作。
4、扩展:将库中数据进行可视化展示。
import re
from urllib.parse import urljoin

import requests
import xlwt
from bs4 import BeautifulSoup

# Listing section that is scraped; page N is served at "<url>index_N.html".
url = "https://www.dygod.net/html/gndy/dyzz/"

# Browser-like User-Agent header sent with every request.
hd = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'
    )
}

_SITE_ROOT = "https://www.dygod.net/"
_REQUEST_TIMEOUT = 10  # seconds; without it a stalled server hangs the run
_COLUMN_COUNT = 6      # title, translated name, year, origin, category, magnet
_OUTPUT_FILE = "movie.xls"

# Label patterns, compiled once. ``\s*`` tolerates any amount of ASCII or
# ideographic (U+3000) whitespace inside the label; the group captures the
# remainder of the line.
_TRANSLATED_NAME = re.compile(r"◎译\s*名(.*)")
_YEAR = re.compile(r"年\s*代(.*)")
_ORIGIN = re.compile(r"◎产\s*地(.*)")
_CATEGORY = re.compile(r"◎类\s*别(.*)")


def _fetch_soup(link):
    """Download *link* and return it parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
    """
    res = requests.get(link, headers=hd, timeout=_REQUEST_TIMEOUT)
    res.raise_for_status()
    # Let requests guess the charset from the body before decoding.
    res.encoding = res.apparent_encoding
    return BeautifulSoup(res.text, "html.parser")


def getmanget(linkurl):
    """Return the text of the first ``<a>`` on *linkurl* containing "magnet".

    Args:
        linkurl: absolute URL of a movie detail page.

    Returns:
        The anchor's string, or ``None`` when no anchor text contains
        "magnet".

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    soup = _fetch_soup(linkurl)
    for anchor in soup.find_all("a"):
        if "magnet" in str(anchor.string):
            return anchor.string
    return None


def insertDB():
    """Placeholder for database persistence; not implemented yet."""
    pass


def saveExcel(worksheet, count, lst):
    """Write *lst* into row *count* of *worksheet*, one value per column.

    Exactly ``_COLUMN_COUNT`` cells are written: extra values are ignored
    and missing ones are written as empty strings, so a record with an
    absent field no longer raises ``IndexError``.
    """
    cells = list(lst[:_COLUMN_COUNT])
    cells += [""] * (_COLUMN_COUNT - len(cells))
    for column, value in enumerate(cells):
        worksheet.write(count, column, value)


def _field(pattern, text):
    """Return the first capture of *pattern* in *text*, cleaned, or ""."""
    match = pattern.search(text)
    if match is None:
        return ""
    return match.group(1).replace("\u3000", "").strip()


def _parse_movie(entry):
    """Turn one listing entry into a six-item row, or ``None`` if unusable.

    The row is always [title, translated name, year, origin, category,
    magnet]; fields that cannot be found are empty strings so that columns
    stay aligned in the spreadsheet.
    """
    anchor = entry.find("a")
    if anchor is None:
        # Without the title link there is neither a title nor a detail page.
        return None

    html = str(entry)
    name = _field(_TRANSLATED_NAME, html)
    year = _field(_YEAR, html)
    origin = _field(_ORIGIN, html)
    category = _field(_CATEGORY, html)
    if name:
        print("◎译 名:", name)
    if year:
        print("◎年 代:", year)

    magnet = None
    href = anchor.get("href")
    if href:
        try:
            # urljoin copes with both relative and root-absolute hrefs.
            magnet = getmanget(urljoin(_SITE_ROOT, href))
        except requests.RequestException as exc:
            # One broken detail page must not abort the whole scrape.
            print(f"could not fetch {href}: {exc}")

    return [
        anchor.get_text(strip=True),
        # Only the first of several "/"-separated alternatives is kept.
        name.split("/")[0],
        year,
        origin.split("/")[0],
        category.split("/")[0],
        str(magnet) if magnet else "",
    ]


def main(pages=range(2, 3)):
    """Scrape the given listing *pages* into ``movie.xls``.

    Args:
        pages: iterable of listing page numbers; defaults to page 2 only.

    Returns:
        The number of movie rows written.
    """
    count = 0
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet('sheet1')

    for page in pages:
        try:
            soup = _fetch_soup(f"{url}index_{page}.html")
        except requests.RequestException as exc:
            print(f"could not fetch listing page {page}: {exc}")
            continue

        for entry in soup.find_all(class_="tbspan", style="margin-top:6px"):
            row = _parse_movie(entry)
            if row is None:
                continue
            saveExcel(worksheet, count, row)
            count += 1
            print("=" * 100)

    workbook.save(_OUTPUT_FILE)
    print(count)
    return count


if __name__ == "__main__":
    main()