from pyquery import PyQuery as pq
import requests
import json
import time
from requests.exceptions import RequestException
def parse_one_page(docs):
    """Yield one dict per movie entry found in a parsed board page.

    Args:
        docs: Callable document object that maps a CSS selector to a
            selection (in this script, the ``PyQuery`` object built in
            ``main``).

    Yields:
        dict: Keys ``index``, ``image`` (the ``data-src`` attribute of the
        ``.board-img`` element), ``title``, ``actor``, ``time`` and
        ``score`` (the ``.integer`` and ``.fraction`` texts concatenated).
    """
    # The actor line is expected to carry a "主演" label; accept both the
    # full-width and the ASCII colon.
    actor_labels = ('主演：', '主演:')
    for item in docs('.board-wrapper').find('dd').items():
        actor = item.find('.star').text().strip()
        # Strip the label only when it is really there: a blind
        # three-character slice would eat the start of an unlabeled name.
        for label in actor_labels:
            if actor.startswith(label):
                actor = actor[len(label):]
                break
        # Look the score container up once and reuse it for both halves.
        score = item.find('.score')
        yield {
            'index': item.find('.board-index').text(),
            'image': item.find('.board-img').attr('data-src'),
            'title': item.find('.name').text(),
            'actor': actor,
            'time': item.find('.releasetime').text(),
            'score': (score.find('.integer').text().strip()
                      + score.find('.fraction').text().strip()),
        }
def write_to_file(item, path='test.txt'):
    """Append ``item`` to a text file as one JSON document per line.

    Args:
        item: JSON-serializable object (the scraper passes a dict).
        path: Destination file; defaults to ``'test.txt'`` so existing
            single-argument callers keep their behavior.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the output instead
    # of escaping it to \uXXXX sequences; the file is written as UTF-8.
    line = json.dumps(item, ensure_ascii=False) + '\n'
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
def main(offset):
    """Fetch the board page at ``offset``, then print and save every entry.

    Args:
        offset: Value appended to the ``offset`` query parameter of the
            board URL.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    page_url = 'http://maoyan.com/board/4?offset=' + str(offset)
    # Passing a URL to pq() makes pyquery download and parse the page; the
    # browser-like User-Agent header is sent with that request.
    page = pq(page_url, headers={'User-Agent': user_agent})
    for movie in parse_one_page(page):
        print(movie)
        write_to_file(movie)
if __name__ == '__main__':
    # Walk offsets 0, 10, ..., 90 (ten pages), pausing one second after each
    # page before requesting the next.
    for page_offset in range(0, 100, 10):
        main(offset=page_offset)
        time.sleep(1)
# 利用Python爬虫抓取猫眼电影排行(pyquery方式)
# 最新推荐文章于 2019-12-16 11:56:45 发布