1.requests失败,换了好几个headers都不行,一直返回418,所以改用selenium
2.多线程导致是乱序的,懒得排序了
3.窒息……之前把user-agent写成了user_agent导致一直被拒,所以requests其实也是可以的
from lxml import etree
from selenium import webdriver
import time
import threading
def dis(itable):
    """Print every element of *itable* on its own line.

    Args:
        itable: Any iterable; each element is printed via its ``str`` form.
    """
    for text in map(str, itable):
        print(text)
# Base address of the Douban Top 250 listing; the paging query string is
# appended to it when the individual page URLs are built.
url = "https://movie.douban.com/top250"
# Serializes appends to the shared output file. cal() is started from several
# threads at once, and unsynchronized appends could interleave the records of
# different pages.
_write_lock = threading.Lock()


def _fetch_page_source(url, wait):
    """Load *url* in a fresh Edge session and return the rendered HTML."""
    browser = webdriver.Edge()
    try:
        browser.get(url)
        # Crude fixed wait for the page to finish loading; tune to the
        # network speed.
        time.sleep(wait)
        return browser.page_source
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and the finally clause releases the browser even
        # when loading the page raises.
        browser.quit()


def _parse_movies(page_source):
    """Extract (title, rating, comment_count, info) tuples from a page."""
    html = etree.HTML(page_source)

    # Keep only the title spans without a '/'; the spans containing one are
    # filtered out exactly as before.
    titles = [
        text
        for text in html.xpath('//span[@class="title"]/text()')
        if '/' not in text
    ]

    # The info paragraph comes back as separate text nodes. Join them per
    # <p> element instead of pairing a flat list by index, so one unusual
    # entry cannot shift every entry after it.
    values = [
        ''.join(paragraph.xpath('text()'))
        for paragraph in html.xpath('//p[@class=""]')
    ]

    # Ratings.
    stars = html.xpath('//span[@class="rating_num"]/text()')

    # Each "star" block yields the rating text first and the comment count
    # second; pick the second text node of every block.
    comments = []
    for star_block in html.xpath('//div[@class="star"]'):
        texts = star_block.xpath('span/text()')
        comments.append(texts[1] if len(texts) > 1 else '')

    # zip() stops at the shortest list, so a page with fewer entries than
    # usual produces fewer records instead of raising IndexError.
    return list(zip(titles, stars, comments, values))


def cal(url, wait=5, out_path='data3.txt'):
    """Scrape one listing page and append its entries to the output file.

    Args:
        url: Address of the listing page to load.
        wait: Seconds to sleep after opening the page so it can finish
            loading.
        out_path: File the records are appended to (UTF-8).

    Each record is written as ``title<TAB>rating<TAB>comment_count`` on one
    line, followed by the info text and a blank line.
    """
    movies = _parse_movies(_fetch_page_source(url, wait))
    records = [
        '{}\t{}\t{}\n{}\n\n'.format(title, star, comment, value)
        for title, star, comment, value in movies
    ]
    # Hold the lock for the whole append so one page's records stay together.
    with _write_lock, open(out_path, 'a', encoding='utf-8') as f:
        f.writelines(records)
# Query-string template that selects the page; the slot is the zero-based
# index of the first entry on that page.
para = '?start={}&filter='

# Truncate the output file so this run starts empty (cal() only appends).
open('data3.txt', 'w', encoding='utf-8').close()

# Rough-and-ready multithreading: 10 pages of 25 entries each, scraped in
# batches of 5 threads.
thread = []
for batch_first in range(0, 10, 5):
    thread.extend(
        threading.Thread(target=cal, args=(url + para.format(page * 25),))
        for page in range(batch_first, batch_first + 5)
    )
    for worker in thread:
        time.sleep(2)  # starting requests too quickly gets the client banned
        worker.start()
    # Wait for the whole batch before launching the next one.
    for worker in thread:
        worker.join()
    thread.clear()