# (新手Python)豆瓣:输入电影名后,自动提取评论的昵称、时间和内容,并保存到文件
#import re
import time
import requests
import urllib.parse as up
from lxml import etree
from selenium import webdriver
# Browser-like User-Agent sent with every comment-page request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}

# Number of comments requested per page (the "limit" query parameter); also
# the step used to compute each page's "start" offset.
COMMENTS_PER_PAGE = 20

# File the scraped records are written to (UTF-8 text, overwritten each run).
OUTPUT_PATH = "E:/shuju.doc"


def build_search_url(movie_name):
    """Return the Douban movie-search URL for *movie_name*.

    The name is percent-encoded with ``urllib.parse.quote`` so non-ASCII
    titles can be placed in the query string.
    """
    return (
        "https://search.douban.com/movie/subject_search?search_text="
        + up.quote(movie_name)
        + "&cat=1002"
    )


def build_comments_url(subject_url, page_index):
    """Return the comment-listing URL for one zero-based page of a subject.

    Args:
        subject_url: Link of the movie's subject page.
        page_index: Zero-based page number; page ``k`` starts at offset
            ``k * COMMENTS_PER_PAGE``.
    """
    start = page_index * COMMENTS_PER_PAGE
    # The href may end with "/"; strip it so the path does not contain "//".
    return (
        subject_url.rstrip("/")
        + "/comments?start=" + str(start)
        + "&limit=" + str(COMMENTS_PER_PAGE)
        + "&sort=new_score&status=P"
    )


def format_record(name, posted_at, comment):
    """Join nickname, timestamp and comment text into one output record."""
    return name + " " + posted_at + " " + comment


def find_first_subject_url(search_url):
    """Open *search_url* in Chrome and return the first result's href.

    The href could not be extracted with a plain XPath query on the fetched
    HTML, so Selenium is used and the link is read via ``get_attribute``.
    The browser is always shut down, even if the element lookup fails.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(search_url)
        # find_element("xpath", ...) works on both Selenium 3 and 4;
        # find_element_by_xpath was removed in Selenium 4.
        link = driver.find_element(
            "xpath",
            '//*[@id="root"]/div/div[2]/div[1]/div[1]/div[1]/div[1]/a',
        )
        return link.get_attribute("href")
    finally:
        driver.quit()


def fetch_comment_rows(page_url):
    """Download one comment page and return ``(name, time, comment)`` tuples."""
    html = requests.get(page_url, headers=headers, timeout=10).text
    page = etree.HTML(html)
    # Nicknames
    names = page.xpath('//div[@class="avatar"]/a/@title')
    # Timestamps. NOTE(review): the trailing space inside the class value is
    # kept from the original query; an exact @class match needs it only if
    # the page markup has it -- confirm against the live page.
    times = page.xpath('//span[@class="comment-time "]/@title')
    # Comment texts
    comments = page.xpath('//span[@class="short"]/text()')
    # zip stops at the shortest list, so a page where the three queries
    # return different counts no longer raises IndexError.
    return list(zip(names, times, comments))


def main():
    """Prompt for a movie name and page count, then scrape and save comments.

    Each record is written to OUTPUT_PATH on its own line and echoed to the
    console. Exactly the requested number of pages is fetched.
    """
    movie_name = input("请输入电影名称:")
    # Parse the page count up front so bad input fails before Chrome starts.
    page_count = int(input("请输入查询页数:"))

    subject_url = find_first_subject_url(build_search_url(movie_name))

    # "with" guarantees the file is closed even if a request fails midway.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        for page_index in range(page_count):
            page_url = build_comments_url(subject_url, page_index)
            for name, posted_at, comment in fetch_comment_rows(page_url):
                record = format_record(name, posted_at, comment)
                # One record per line; without the newline all records
                # would run together in the file.
                out.write(record + "\n")
                print(record)
                print("\n")
    print("完成By1192106506")


if __name__ == "__main__":
    main()
#用正则提取时第一页可以,后面的页不行(源代码中无显示)
'''
pat1= '(.*?)'
pat2= '<a title="(.*?)" href='
pat3=''
names=re.compile(pat2).findall(r)
times=re.compile(pat3).findall(r)
rsts=re.compile(pat1).findall(r)
for j in range(len(times)):
    print(names[j],' ',times[j],' ',rsts[j])
    print('\n')
'''