# encoding: utf-8
"""Simple web scraper for Mtime (时光网) box-office movie rankings.

Builds the JSON-backed ranking-page URL for a given year/page/area,
parses the returned listing with BeautifulSoup, and appends the
extracted fields to an open text file.

Typical driver values (from the original script): areas
``'china'``, ``'NorthAmerica'``, ``'global'``; ``pagecount = 1``;
years ``2016``..``2017``.

NOTE(review): ported from Python 2 to Python 3 — the original used
``print`` statements, ``urllib.urlopen``, the ``urlparse`` module and
the ``reload(sys)``/``setdefaultencoding`` hack, none of which exist
on a supported interpreter.
"""
import urllib.parse
import urllib.request


def geturlparams(url):
    """Return the query-string parameters of *url* as a flat dict.

    ``parse_qs`` yields a list of values per key; only the first value
    is kept, matching the original behaviour.
    """
    query = urllib.parse.urlparse(url).query
    return {k: v[0] for k, v in urllib.parse.parse_qs(query).items()}


def geturl(year, page, area):
    """Build the Mtime box-office ranking URL for *year*/*page*/*area*.

    BUG FIX: the original literal contained ``×tamp=`` — the HTML
    entity ``&times`` had been rendered as the ``×`` character when the
    code was copied off a web page — so the ``timestamp`` query
    parameter was mangled. Restored to ``&timestamp=``.
    """
    return (
        'http://movie.mtime.com/boxoffice/?year=%s' % year
        + '&area=%s' % area
        + '&type=MovieRankingYear&category=all&page=%s' % page
        + '&display=list&timestamp=1515392838844'
          '&version=07bb781100018dd58eafc3b35d42686804c6df8d&dataType=json'
    )


def export(file, url):
    """Fetch *url*, parse each ``<dd>`` entry and write its fields to *file*.

    Each field is also printed to stdout as progress feedback, in the
    same order as the original script (all three headline fields
    printed first, then recorded).
    """
    # Lazy import: bs4 is third-party, and the pure URL helpers above
    # should remain usable without it installed.
    from bs4 import BeautifulSoup

    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    for entry in soup.find_all('dd'):
        # Headline fields of one ranking entry: rank (<i>), title (<h3>),
        # secondary title (<h4>).
        print(entry.i.string)
        print(entry.h3.string)
        print(entry.h4.string)
        record(file, entry.i.string)
        record(file, entry.h3.string)
        record(file, entry.h4.string)
        # Remaining detail lines (<p>) of the entry.
        for p in entry.find_all('p'):
            text = p.get_text()
            print(text)
            record(file, text)
        print("************************************************")
        record(file, "************************************************")


def record(file, param):
    """Append *param* to the open text file *file* with a CRLF terminator."""
    file.write(param + '\r\n')


def scraping(file, pagecount, minyear, maxyear, area):
    """Scrape pages ``[0, pagecount)`` for years ``[minyear, maxyear)`` in *area*."""
    for page in range(pagecount):
        for year in range(minyear, maxyear):
            export(file, geturl(year, page, area))


# Demo: parse the query string of a sample ranking URL (no network access;
# the &timestamp parameter here carries the same entity fix as geturl()).
print(geturlparams(
    'http://movie.mtime.com/boxoffice/?year=2016&area=china'
    '&type=MovieRankingYear&category=all&page=0&display=list'
    '&timestamp=1515573624641'
    '&version=07bb781100018dd58eafc3b35d42686804c6df8d&dataType=json'))
简易的网络爬虫代码-python
最新推荐文章于 2021-02-05 17:23:08 发布