具体的文字步骤就不再赘述了,代码可以直接放在 PyCharm 中执行。
# -*- coding: utf-8 -*-
# @Time     : 2021/2/2
# @Author   : song
# @File     : spider.py
# @Software : PyCharm
"""Scrape the Douban Top-250 movie chart and save the data to an .xls file."""

import re
import urllib.error
import urllib.request

import xlwt
from bs4 import BeautifulSoup

# Regex patterns, compiled once at module level and reused for every item.
# Link to the movie's detail page (non-greedy: stop at the first closing quote).
findlist = re.compile(r'<a href="(.*?)">')
# Poster image URL.
findImgSrc = re.compile(r'<img.*?src="(.*?)"', re.S)
# Movie title(s) — one span for the Chinese title, optionally a second for the other title.
findTitle = re.compile(r'<span class="title">(.*?)</span>')
# Average rating.
findpoint = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# Number of people who rated the movie.
findpople = re.compile(r'<span>(\d*?)人评价</span>')
# One-line summary (some movies do not have one).
findgai = re.compile(r'<span class="inq">(.*?)</span>')
# Director / main cast block.
findyan = re.compile(r'<p class="">(.*?)</p>', re.S)


def main():
    """Entry point: crawl all chart pages, then write the rows to Excel."""
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getDate(baseurl)
    savedate(datalist)


def getDate(baseurl):
    """Crawl the 10 chart pages (25 movies each) and return a list of rows.

    Each row is: [link, image URL, Chinese title, other title, rating,
    rating count, summary, director/cast].
    """
    datalist = []
    for page in range(10):  # pages are offset 0, 25, 50, ... 225
        html = askURL(baseurl + str(page * 25))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            item = str(item)
            data = []
            data.append(re.findall(findlist, item)[0])    # detail-page link
            data.append(re.findall(findImgSrc, item)[0])  # poster URL
            titles = re.findall(findTitle, item)
            chinese = titles[0]
            other = ' '
            if len(titles) == 2:  # the movie also has a non-Chinese title
                other = re.sub(r"\xa0/\xa0", "", titles[1])
            data.append(chinese)
            data.append(other)
            data.append(re.findall(findpoint, item)[0])   # rating
            data.append(re.findall(findpople, item)[0])   # rating count
            gai = re.findall(findgai, item)               # summary may be missing
            data.append(gai[0] if gai else ' ')
            yan = re.findall(findyan, item)[0]
            yan = re.sub(r"<br(\s+)?/>(\s+)?", "", yan)   # drop <br/> tags
            yan = re.sub(r"\xa0", "", yan)                # drop non-breaking spaces
            data.append(yan.strip())
            datalist.append(data)
    return datalist


def askURL(url):
    """Fetch *url* with a browser User-Agent; return HTML text, or '' on error."""
    heads = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36"
    }
    req = urllib.request.Request(url, headers=heads)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as err:  # narrower than catching Exception
        # HTTPError carries .code; plain URLError only has .reason.
        if hasattr(err, "code"):
            print(err.code)
        if hasattr(err, "reason"):
            print(err.reason)
    return html


def savedate(datalist):
    """Write the scraped rows to 影片.xls: a header row, then one row per movie."""
    wordbook = xlwt.Workbook(encoding="utf-8")
    wordsheet = wordbook.add_sheet("asd")
    headers = ("影片地址", "图片", "中文名", "其他名", "评分", "评价人数", "概述", "演员导演")
    for col, title in enumerate(headers):
        wordsheet.write(0, col, title)
    for row, record in enumerate(datalist, start=1):
        for col, value in enumerate(record):
            wordsheet.write(row, col, value)
    wordbook.save("影片.xls")


if __name__ == "__main__":
    main()
不同网站的页面结构不同,爬取逻辑也随之不同,请勿把本代码直接套用到其他网站。
Python爬取豆瓣250电影信息的脚本代码
最新推荐文章于 2024-05-13 15:07:13 发布