# BUG NOTE: the scraper always returns the first page's content, regardless of the requested page number.
import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
import re
import simplejson as json
def get_one_page(url):
    """Download *url* and return the response body as text.

    Returns None when the request raises a network error or the server
    answers with a non-200 status code.
    """
    try:
        # A timeout keeps the scraper from hanging forever on a dead host;
        # the original had none.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None  # explicit: non-200 responses yield no content
    except RequestException:
        return None
def parse_one_page(html):
    """Parse one listing page and yield {'movie': ..., 'actor': ...} dicts.

    The actor text has its first three characters (a "主演:"-style label —
    TODO confirm against the live page) stripped, and the remaining names
    are joined with commas.
    """
    doc = pq(html)
    movie = doc('.video_item .sort_lst_tit').text()
    actor = doc('.video_item .sort_lst_txt')

    actor_list = []
    for items in actor.items():
        # Drop the leading 3-char label, collapse whitespace, join with commas.
        names = "".join(((items.text()).strip()[3:])).split()
        actor_list.append(",".join(names))

    # Defect fixed: the original removed '(2017)' entries and inserted into
    # actor_list while iterating over movie_list, which can skip elements.
    # Filter first, then do the single positional insert.
    movie_list = [title for title in movie.split(' ') if title != '(2017)']
    if '厉害了,我的国' in movie_list:
        # This documentary has no actor entry on the page; align the lists.
        actor_list.insert(movie_list.index('厉害了,我的国'), '央视')

    for i in range(len(actor_list)):
        yield {
            'movie': movie_list[i],
            'actor': actor_list[i]
        }
def write_to_file(content):
    """Append *content* to result.txt as one JSON object per line.

    ensure_ascii=False keeps the Chinese titles human-readable in the
    output file. The `with` statement closes the file automatically,
    so no explicit close() is needed (the original's f.close() inside
    the with-block was redundant).
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(num):
    """Scrape listing page *num* and print/persist every movie found.

    Skips the page silently when the download failed (get_one_page
    returned None) instead of crashing inside the parser.
    """
    url = 'http://tv.sogou.com/film/list/style-%E5%96%9C%E5%89%A7+page-' + str(num)
    html = get_one_page(url)
    if html is None:
        # Download failed — nothing to parse for this page.
        return
    # Defect fixed: the original called parse_one_page(html) twice,
    # discarding the first generator.
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # Scrape listing pages 1 through 9.
    for page in range(1, 10):
        main(page)