# Download the ten pages of the Maoyan board and keep each page's HTML text.
server = []
for i in range(10):
    # The ``offset`` query parameter advances in steps of 10 per page.
    url = "https://maoyan.com/board/4?offset=" + str(i * 10)
    # NOTE(review): the proxy mapping only has an "http" key while the URL is
    # https, and the proxy value has no scheme or port -- confirm whether the
    # proxy is actually meant to be used for these requests.
    cat = requests.get(
        url,
        headers=headers,
        proxies={"http": "58.253.157.136"},
        timeout=10,  # without a timeout a stalled connection blocks forever
    )
    cat_text = cat.text
    server.append(cat_text)
# Create an empty table; it stays empty if no page yields any data.
maoyan = pd.DataFrame()

# Extract the movie information from every downloaded page.
frames = []  # one DataFrame per board list, concatenated once at the end
for i in server:
    data = etree.HTML(i)
    pos = data.xpath('//dl[@class="board-wrapper"]')
    for j in pos:
        # Every path is relative to the current board list ``j`` so that the
        # fields come from this list only; a path starting with ``//`` would
        # search the whole document regardless of the context node.
        ranking = j.xpath('./dd/i/text()')
        picture = j.xpath('.//a/img/@data-src')
        movie_web = j.xpath('.//div/div[1]/div[1]/p[1]/a/@href')
        movie_name = j.xpath('./dd/div/div/div[1]/p[1]/a/text()')
        performer = j.xpath('./dd/div/div/div[1]/p[2]/text()')
        Release_date = j.xpath('./dd/div/div/div[1]/p[3]/text()')
        # The score is split across two <i> elements (presumably the integer
        # and fractional parts -- confirm against the page markup); join each
        # pair back into a single string.
        grades1 = j.xpath('./dd/div/div/div[2]/p/i[1]/text()')
        grades2 = j.xpath('./dd/div/div/div[2]/p/i[2]/text()')
        score = [first + second for first, second in zip(grades1, grades2)]
        # All columns must have the same length, otherwise pandas raises
        # ValueError when building the frame.
        result = pd.DataFrame({
            "电影排名": ranking,
            "图片地址": picture,
            "电影详情页地址": movie_web,
            "电影名称": movie_name,
            "电影主演": performer,
            "首映时间": Release_date,
            "评分": score,
        })
        frames.append(result)

# DataFrame.append was removed in pandas 2.0; a single concat also avoids
# re-copying the accumulated table on every iteration.
if frames:
    maoyan = pd.concat(frames, ignore_index=True)