# 声明一下: 运行此脚本有被封 IP 的风险(作者亲测被封过), 最开始三四次可能没事。
# (Warning: running this scraper risks an IP ban from Douban; the code works,
#  but it is best not to actually run it.)
# 被封后可以解封: 把 time.sleep 的时间调长一点, 调成 60 秒就成功了。
import requests
import re
from lxml import html
import time
# 1. Proxy IP and User-Agent shared by every request below.
# Browser UA string so the site does not reject us as a bot.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
# NOTE(review): a hard-coded free proxy -- probably dead by now; verify before running.
proxy={'http': 'http://69.63.170.74:3128'}
# 2. Collect the detail-page URLs from one paginated list page.
def data_urls(url):
    """Fetch a Top250 list page and return the movie detail-page URLs on it.

    url: one list page, e.g. https://movie.douban.com/top250?start=0&filter=

    Returns a list of URL strings (empty if the page layout changed or the
    request was blocked).
    """
    # BUG FIX: a timeout is required -- without one a dead proxy makes
    # requests.get() hang forever.
    resp = requests.get(url, headers=headers, proxies=proxy, timeout=15).text
    # Each movie entry sits in a <div class="hd">; the link follows immediately.
    pat = '<div class="hd">.*?<a href="(.*?)" class="">'
    urls = re.compile(pat, re.S).findall(resp)
    return urls
# 3. Enter each detail page and extract the fields we need.
# 3.1 Movie title.
def filmname(txt):
    """Return the movie title, with all <h1><span> text nodes merged."""
    parts = txt.xpath('//h1/span/text()')
    return ''.join(parts)  # collapse the list of text nodes into one str
# 3.2 Director(s) of the movie.
def director(txt):
    """Return the director name(s) concatenated into a single string."""
    query = '//div[@id="info"]/span[1]/span[2]/a/text()'
    return ''.join(txt.xpath(query))
# 3.3 Rating score of the movie.
def score(txt):
    """Return the movie's rating as a string (empty if not found)."""
    rating_nodes = txt.xpath('//div[@class="rating_self clearfix"]/strong/text()')
    rating = ''.join(rating_nodes)
    return rating
# 3.4 Number of ratings.
def comment(txt):
    """Return the rating count as a single string."""
    pieces = txt.xpath('//div[@class="rating_sum"]//span/text()')
    joined = ''.join(pieces)
    return joined
# 3.5 Lead actors (the original author left only the xpath because cast
#     lists are long).
def star(txt):
    """Return the lead actors joined with '/' (empty string if none found).

    BUG FIX: the original computed the xpath result but had no return
    statement, so the value was silently discarded and the function
    always returned None.
    """
    stars = txt.xpath('//span[3]/span/span/a/text()')
    return '/'.join(stars)
# 3.6 Aggregate: scrape one detail page and append one CSV row.
def data(link, f):
    """Scrape one movie detail page and append its fields to the open file.

    link: detail-page URL.
    f:    writable text file handle (the CSV opened in main()).

    Writes one line: name,director,score,comment-count.
    """
    # BUG FIX: a timeout is required -- without one a dead proxy makes
    # requests.get() hang forever.
    resp = requests.get(link, headers=headers, proxies=proxy, timeout=15).text
    txt = html.etree.HTML(resp)
    names = filmname(txt)
    directors = director(txt)
    scores = score(txt)
    comments = comment(txt)
    films = {'电影名': names, '导演': directors, '评分': scores, '评论数': comments, }
    print(films)  # progress feedback on the console
    # NOTE(review): fields are not quoted/escaped -- a comma inside a title
    # would break the row; consider the stdlib csv module.
    f.write('{},{},{},{}\n'.format(names, directors, scores, comments))
# 4. Drive the scrape over all list pages and write the CSV.
def main():
    """Scrape all 10 Top250 list pages and append the results to top250.csv."""
    with open('top250.csv', 'a', encoding='utf-8') as f:
        # BUG FIX: range(0, 251, 25) produced an 11th, empty page (start=250);
        # the site has exactly 10 pages: start = 0, 25, ..., 225.
        for start in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start=' + str(start) + '&filter='
            # BUG FIX: data_urls takes only the url (was wrongly passed f too).
            links = data_urls(url)
            for link in links:
                # BUG FIX: data requires the file handle as its 2nd argument
                # (was called with link only -- TypeError at runtime).
                data(link, f)
            # BUG FIX: report movies completed so far, not the 0-based offset.
            print('已成功' + str(start + 25) + '部')
            # Long delay between pages to reduce the risk of an IP ban
            # (see the note at the top of the file).
            time.sleep(60)
# Entry-point guard: run the scraper only when executed as a script.
if __name__== '__main__':
    main()