# Scrape the Douban Top 250 movie list and save the entries to a SQLite database.
import requests
from bs4 import BeautifulSoup
import sqlite3
def get_html(web_url):
    """Fetch one Douban Top-250 listing page and return its film list element.

    Args:
        web_url: Full URL of one paginated listing page.

    Returns:
        The ``<ol class="grid_view">`` BeautifulSoup tag containing the 25
        film entries of that page, or ``None`` if the element is absent.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the request exceeds the timeout.
    """
    # A mobile user agent lowers the chance of the request being blocked.
    user_agent = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
    headers = {'User-Agent': user_agent}
    # timeout prevents the script from hanging forever on a stalled connection;
    # raise_for_status fails fast instead of parsing an error page.
    response = requests.get(web_url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find('ol', attrs={'class': 'grid_view'})
def get_title(content):
    """Extract every film entry from a listing page and insert it into SQLite.

    Inserts one row per ``<li>`` into table ``filmtop250`` of ``file.sqlite3``
    (the database and table must already exist).

    Args:
        content: The ``<ol class="grid_view">`` tag returned by ``get_html``.
    """
    films = content.find_all('li')
    conn = sqlite3.connect('file.sqlite3')
    try:
        cursor = conn.cursor()
        # Characters to strip from the credits text: whitespace plus stray
        # non-ASCII artifacts that appear in some entries.
        junk = str.maketrans('', '', ' \n\xa0\xee\xf6\u0161\xf4\xfb\xe5\u22ef')
        for item in films:
            # NOTE: attrs must be a dict mapping attribute name to value;
            # the set form {'class', 'pic'} does not filter by class.
            rank = item.find('div', attrs={'class': 'pic'}).text
            title = item.find('span', attrs={'class': 'title'}).text
            rate = item.find('span', attrs={'class': 'rating_num'}).text
            autor = item.find('p').get_text()
            member = autor.translate(junk)
            quote_tag = item.find('p', attrs={'class': 'quote'})
            if quote_tag is None:
                quote = '无评语'
            else:
                quote = quote_tag.get_text().replace('\n', '').replace('\u22ef', '')
            # Parameterized query — sqlite3 uses '?' placeholders.
            sql = 'insert into filmtop250 values (?,?,?,?,?,?)'
            cursor.execute(sql, (rank, title, rate, autor, member, quote))
        conn.commit()
        cursor.close()
    finally:
        # Guarantee the connection is released even if parsing/insert fails.
        conn.close()
if __name__ == '__main__':
    # Douban paginates the Top 250 in pages of 25; 'start' is the page offset.
    for start in range(0, 250, 25):
        page_url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
        page_content = get_html(page_url)
        get_title(page_content)
# Notes:
# 1. The placeholder for "insert into" in sqlite3 is "?".
# 2. First create the database file.sqlite3 and the table filmtop250 (e.g. from PyCharm's terminal).
# 3. Mind where the database is closed, i.e. the placement of cursor.close() and conn.close().