本文实例讲述了Python爬虫实现的根据分类爬取豆瓣电影信息功能。分享给大家供大家参考,具体如下:
# Entry point of the code: if __name__ == '__main__': main()
#! /usr/bin/python3
# -*- coding:utf-8 -*-
# author:Sirius.Zhao
import json
from urllib.parse import quote
from urllib.request import urlopen
from urllib.request import Request
import pymysql
import requests
from bs4 import BeautifulSoup
import sys
import datetime
import time
# The `imp` module was removed in Python 3.12; importlib.reload is the
# supported replacement and provides the same `reload` name.
from importlib import reload
import random


def LoadUserAgents(uafile):
    """Load user-agent entries from a text file and return them shuffled.

    uafile : string
        path to text file of user agents, one per line

    The file is opened in binary mode, so the returned list contains
    ``bytes`` objects. Each line is stripped of surrounding whitespace and
    then sliced with ``[1:-2]``, which drops its first character and its
    last two characters.
    NOTE(review): presumably this removes quoting/punctuation that wraps
    each entry in user_agents.txt -- confirm against the actual file.
    """
    with open(uafile, 'rb') as uaf:
        # Lines yielded by a file object are never empty, so no emptiness
        # check is needed before slicing.
        uas = [ua.strip()[1:-2] for ua in uaf]
    random.shuffle(uas)
    return uas


# Loaded once at import time; requires user_agents.txt in the working
# directory.
uas = LoadUserAgents("user_agents.txt")

# All movies, de-duplicated.
dict_movies = {}


def datetime_to_timestamp_in_milliseconds(d):
    """Return the current Unix time in milliseconds as an int.

    NOTE(review): the argument ``d`` is never read; despite the function
    name, the result is always derived from ``time.time()``. The parameter
    is kept so existing callers continue to work.
    """
    return int(round(time.time() * 1000))


# NOTE(review): presumably a leftover from the Python 2
# `reload(sys); sys.setdefaultencoding(...)` idiom; kept to preserve the
# script's module-level behavior.
reload(sys)

# The category list is fetched from the URL below:
# https://movie.douban.com/chart
# Movies are fetched by category and percentage interval:
# https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id=100:90

# List of percentage intervals (used as the interval_id value in the
# typerank URL shown above).
percent_list = ['100:90', '90:80', '80:70', '70:60', '60:50',
                '50:40', '40:30', '30:20', '20:10', '10:0']

# Fetch the category list.
# NOTE(review): the definition of find_iterm is cut off at this point in the
# source text; the visible fragment is kept verbatim below instead of being
# reconstructed by guesswork.
# def find_iterm(url): response = urlopen(url) bs = BeautifulSoup(response,'html.pars