:start: 开始的记录数,0-480
‘’’
self.movie_id = movie_id
self.start = start
self.type = type
self.url = ‘https://movie.douban.com/subject/{id}/comments?start={start}&limit=20&sort=new_score&status=P&percent_type={type}&comments_only=1’.format(
id=str(self.movie_id),
start=str(self.start),
type=self.type
)
#创建数据库连接
self.session = create_session()
#随机useragent
def _random_UA(self):
    """Set a randomly chosen User-Agent on the request headers.

    Picks one entry from ``USERAGENT`` (a name defined outside this
    method) with ``random.choice`` and stores it under the
    ``'User-Agent'`` key of ``self.headers``, overwriting any previous
    value.
    """
    self.headers['User-Agent'] = random.choice(USERAGENT)
#获取api接口,使用get方法,返回的数据为json数据,需要提取里面的HTML
def _get(self):
    """Fetch the comments endpoint and return the HTML fragment it carries.

    Refreshes the User-Agent via ``_random_UA``, sends a GET request to
    ``self.url`` with ``self.cookies`` and ``self.headers``, decodes the
    JSON body and extracts its ``'html'`` entry.

    Returns:
        The value stored under ``'html'`` in the JSON response, or ``''``
        when the request, the JSON decoding or the key lookup fails.
    """
    self._random_UA()
    res = ''
    try:
        # Keep the Response in its own name so that a failure while
        # decoding leaves ``res`` as the '' fallback rather than a
        # Response object.
        response = requests.get(
            self.url,
            cookies=self.cookies,
            headers=self.headers,
            timeout=10,  # without a timeout a stalled connection blocks forever
        )
        res = response.json()['html']
    except Exception as e:
        # Best-effort fetch: report the failure and fall through with ''.
        print('IP被封,请使用代理IP')
        print('错误详情: {!r}'.format(e))
    print('正在获取{} 开始的记录'.format(self.start))
    return res
def _parse(self):
res = self._get()
dom = etree.HTML(res)
#id号
self.id = dom.xpath(self.base_node + ‘/@data-cid’)
#用户名
self.username = dom.xpath(self.base_node + ‘/div[@class=“avatar”]/a/@title’)
#用户连接
self.user_center = dom.xpath(self.base