# -*- coding: utf-8 -*-
#@Time : 2020/9/11 17:01
#@Author: wang
#@File : douBanTop250.py
#@software : PyCharm
import urllib.request,urllib.error
from bs4 import BeautifulSoup
import re
import xlwt
def main():
    """Scrape the Douban Top 250 movie chart and save it to an Excel file.

    Walks all 10 result pages (25 movies each), extracts each movie's
    poster link, title and rating score with regexes over the item HTML,
    and hands the collected rows to saveData().
    """
    datalist = []
    url = "https://movie.douban.com/top250?start="
    # Patterns are compiled once, outside the page loop.
    # re.S lets '.' match newlines, since the <img ...> tag spans lines.
    find_pic = re.compile(r'<img.*src="(.*?)"', re.S)
    find_name = re.compile(r'<span class="title">(.*)</span>')
    find_score = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    for i in range(0, 10):
        page = i * 25  # ?start= offset: each page lists 25 movies
        html = askUrl(url + str(page))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.findAll('div', class_="item"):
            str_item = str(item)
            # Take the first match of each field; an IndexError here would
            # mean Douban changed its page markup.
            pic = re.findall(find_pic, str_item)[0]
            name = re.findall(find_name, str_item)[0]
            score = re.findall(find_score, str_item)[0]
            datalist.append([pic, name, score])
    saveData(datalist)
def num_split(word, n):
    """Split *word* into consecutive chunks of length *n*.

    The final chunk is shorter when len(word) is not a multiple of *n*.
    Works on any sliceable sequence; returns a list of the chunks.
    """
    chunks = []
    for start in range(0, len(word), n):
        chunks.append(word[start:start + n])
    return chunks
def saveData(datalist):
    """Write scraped movie rows into an Excel workbook via xlwt.

    Parameters
    ----------
    datalist : list[list[str]]
        Rows of [link, name, score], e.g.
        [['https://.../p480747492.jpg', '肖申克的救赎', '9.7'], ...]

    Row 0 holds the column headers; data rows start at row 1.  The
    workbook is saved as "豆瓣" in the current working directory.
    """
    workbook = xlwt.Workbook(encoding="utf-8")
    sheet = workbook.add_sheet("DouBanTop250")
    col_name = ["link", "name", "score"]
    for col, header in enumerate(col_name):
        sheet.write(0, col, header)
    # enumerate(start=1) leaves row 0 for the header line.
    for row, record in enumerate(datalist, start=1):
        # Only the first three fields are persisted; a short record no
        # longer raises IndexError, it just fills fewer cells.
        for col, value in enumerate(record[:3]):
            sheet.write(row, col, value)
    workbook.save("豆瓣")
def askUrl(urls, timeout=10):
    """Fetch *urls* and return the response body decoded as UTF-8 text.

    Sends a desktop-browser User-Agent so Douban does not reject the
    request as a bot.

    Parameters
    ----------
    urls : str
        The full URL to request.
    timeout : float, optional
        Socket timeout in seconds (default 10) so a stalled connection
        cannot hang the scraper forever.

    Raises
    ------
    urllib.error.URLError / urllib.error.HTTPError
        On network failure or a non-success HTTP status.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"}
    request = urllib.request.Request(urls, headers=headers)
    # Context manager guarantees the HTTP response is closed even if
    # read()/decode() raises (original leaked the connection).
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode("utf-8")
if __name__ == "__main__":
main()
# 豆瓣Top250脚本爬虫  (Douban Top250 scraper)
# 最新推荐文章于 2021-10-22 00:32:18 发布
# NOTE(review): the two lines above are blog-page text accidentally pasted
# after the code; commented out so the module parses.