下面我们把数据存放到数据库中。
如果我们想要在pysql的返回结果里面使用dict类型。
config里面可以写上如下配置
cursorclass = pymysql.cursors.DictCursor
import json
import urllib.request
import urllib.parse
import pymysql
class DoubanMovies(object):
movies = {
}
config = {
"host": "127.0.0.1",
"port": 3306,
"user": "root",
"password": "*******",
"db": "douban_movie",
"charset": "utf8", # not utf-8 take care 使用中文的话,这个要添加进来
}
def __init__(self, base_url):
self.base_url = base_url
# json数据下载
def download(self, start=0):
post_data = {
"start": start,
}
post_data = urllib.parse.urlencode(post_data)
post_data = post_data.encode("utf-8")
print(post_data)
request = urllib.request.Request(self.base_url, post_data)
# download movies
response = urllib.request.urlopen(request)
return response.read().decode("utf-8")
# json数据解析
def parse(self, data):
data = json.loads(data)
self.movies = data["subjects"]
# json数据存放
def store(self):
connector = pymysql.connect(**self.config)
cursor = connector.cursor()
for movie in self.movies:
title = bytes(movie["title"], encoding="utf-8").decode("utf-8")
original_title = movie["original_title"]
genres = " ".join(movie["genres"])
douban_link = movie["alt"]
image_large = movie["images"]["large"]
image_medium = movie["images"]["medium"]
image_small = movie["images"]["small"]
sql = "insert into movies(title, original_title, genres, douban_link, " \
"image_large, image_medium, image_small) " \
'values("%s", "%s" ,"%s","%s", "%s", "%s", "%s")' % (title, original_title, genres, douban_link,
image_large, image_medium, image_small)
cursor.execute(sql)
connector.commit()
cursor.close()
connector.close()
if __name__ == "__main__":
movies_obj = DoubanMovies("https://api.douban.com/v2/movie/top250")
for i in [i * 10 for i in range(25) if i % 2 == 0]:
json_data = movies_obj.download(start=i)
movies_obj.parse(json_data)
movies_obj.store()
print("finished")
其中数据库部分写的比较粗糙,有些字段操作起来很费力。以后慢慢完善。
暂时也不分表了。希望更多的时间花在去电影天堂抓取ftp链接上。
create table movies(id int auto_increment primary key,
title varchar(50) ,
original_title varchar(100),
directors varchar(30),
casts varchar(60),
rating int,
genres varchar(100),
douban_id varchar(20),
image_large varchar(150),
image_medium varchar(150),
image_small varchar(150),
movie_year varchar(10),
stars int,
subtype varchar(30),
douban_link varchar(50)
);