本案例介绍从JavaScript中采集加载的数据。更多内容请参考:Python学习指南
#-*- coding:utf-8 -*-
import requests
import re
import time
import json
#数据下载器
class HtmlDownloader(object):
def download(self, url, params=None):
if url is None:
return None
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0'
headers = {'User-Agent':user_agent}
if params is None:
r = requests.get(url, headers = headers)
else:
r = requests.get(url, headers = headers, params = params)
if r.status_code == 200:
r.encoding = 'utf-8'
return r.text
return None
#数据存储器
class HtmlParser(object):
#从选购电影页面中解析出所有电影信息,组成一个list
def parser_url(self, page_url, response):
pattern = re.compile(r'(http://movie.mtime.com/(\d+)/)')
urls = pattern.findall(response)
if urls != None:
#将urls去重
return list(set(urls))
else:
return None
#解析正在上映的电影
def __parser_release(self, page_url, value):
'''
解析已经上映的电影
:param page_url:电影链接
:param value: json数据
:return
'''
try:
isRelease = 1
movieRating = value.get('value').get('movieRating')
boxOffice = value.get('value').get('boxOffice')
movieTitle = value.get('value').get('movieTitle')
RPictureFinal = movieRating.get('RPictureFinal')
RStoryFinal = movieRating.get('RStoryFinal')
RDirectorFinal = movieRating.get('RDirectorFinal