# Code: Douban Top 250 scraper (requests + BeautifulSoup) that stores results in MySQL via pymysql.
import pymysql
import requests
from bs4 import BeautifulSoup
import lxml
def createDB():
    """Create the ``doubanTop250`` MySQL database if it does not exist yet.

    Connects to the MySQL server on localhost:3306 without selecting a
    database, runs ``create database if not exists`` and commits.

    Any ``pymysql.Error`` is reported on stdout instead of being raised.
    """
    print("创建数据库doubantop250...")
    # Bind both handles up front so the cleanup in ``finally`` is safe even
    # when connect() or cursor() raises before they are assigned.
    conn = None
    cur = None
    try:
        conn = pymysql.connect(host="localhost",
                               port=3306,
                               user="用户名",
                               password="密码",
                               charset="utf8")
        cur = conn.cursor()
        createSql = "create database if not exists doubanTop250 default charset utf8"
        cur.execute(createSql)
        conn.commit()
        print("创建成功")
    except pymysql.Error as e:
        # Errors do not always carry a (code, message) pair; print what is
        # available rather than indexing args blindly.
        print("pymysql Error: ", *e.args[:2])
    finally:
        # Close the cursor before the connection it belongs to.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
def createTable():
    """Create the ``movie`` table inside the ``doubanTop250`` database.

    The table has three text columns: ``title`` (50), ``score`` (5) and
    ``comment`` (100). ``if not exists`` makes the call safe to repeat.

    Any ``pymysql.Error`` is reported on stdout instead of being raised.
    """
    print("创建表...")
    # Bind both handles up front so the cleanup in ``finally`` is safe even
    # when connect() or cursor() raises before they are assigned.
    conn = None
    cur = None
    try:
        conn = pymysql.connect(host="localhost",
                               port=3306,
                               user="用户名",
                               password="密码",
                               db="doubanTop250",
                               charset="utf8")
        cur = conn.cursor()
        createSql = ("create table if not exists movie("
                     " title varchar(50),"
                     " score varchar(5),"
                     " comment varchar(100)) default character set utf8;")
        cur.execute(createSql)
        conn.commit()
        print("创建成功...")
    except pymysql.Error as e:
        # Errors do not always carry a (code, message) pair; print what is
        # available rather than indexing args blindly.
        print("pymysql Error: ", *e.args[:2])
    finally:
        # Close the cursor before the connection it belongs to.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
def spider():
    """Scrape the Douban Top 250 list and insert each movie into ``movie``.

    The listing is fetched 25 entries per request (``start`` = 0, 25, ...,
    225). For every ``div.item`` the title, the rating and the optional
    one-line quote (empty string when absent) are inserted as one row.
    A single commit is issued after the last page has been processed.

    Any exception is reported on stdout instead of being raised.
    """
    # Bind both handles up front so the cleanup in ``finally`` is safe even
    # when connect() or cursor() raises before they are assigned.
    conn = None
    cur = None
    try:
        conn = pymysql.connect(host="localhost",
                               port=3306,
                               user="用户名",
                               password="密码",
                               # Same spelling as createDB/createTable:
                               # database names can be case-sensitive.
                               db="doubanTop250",
                               charset="utf8")
        cur = conn.cursor()
        # Unquoted placeholders: the driver escapes and quotes the values,
        # so scraped text containing quotes cannot break or alter the SQL.
        sql = "insert into movie (title,score,comment) values (%s, %s, %s);"
        for start in range(0, 250, 25):
            print("开始插入第%d部到第%d部电影到数据库中..." % (start, start + 25))
            r = requests.get("https://movie.douban.com/top250?start=" + str(start) + "&filter=")
            # NOTE(review): the page is HTML but is parsed with the "xml"
            # feature; confirm whether the "lxml" HTML parser was intended.
            el = BeautifulSoup(r.content, "xml", from_encoding="utf-8")
            r.close()
            for item in el.find_all("div", class_="item"):
                title = item.find(class_="title").get_text()
                rating_num = item.find("span", class_="rating_num").get_text()
                quote = item.find("span", class_="inq")
                # Stored verbatim; parameter binding makes the old
                # apostrophe-stripping workaround unnecessary.
                commentStr = quote.get_text() if quote is not None else ""
                cur.execute(sql, (title, rating_num, commentStr))
            print("成功插入第%d部到第%d部电影到数据库中" % (start, start + 25))
        conn.commit()
        print("完成!!")
    except Exception as e:
        # Most exceptions carry a single argument; print what is available
        # rather than indexing args blindly.
        print("Error: ", *e.args[:2])
    finally:
        # Close the cursor before the connection it belongs to.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
def main():
    """Set up the database and table, then run the scraper."""
    # createTable() and spider() both connect to the doubanTop250 database,
    # so make sure it exists first; createDB() uses "if not exists" and is
    # therefore safe to call on every run.
    createDB()
    createTable()
    spider()


if __name__ == '__main__':
    main()