Python: Crawling Douban Movie Top250

Base page: https://movie.douban.com/top250

Code:

from time import sleep

from requests import get
from bs4 import BeautifulSoup
import re
import pymysql

# connect to local MySQL (the `douban` database must already exist)
db = pymysql.connect(host='localhost',
                     user='root',
                     password='123456',
                     db='douban',
                     charset='utf8mb4',
                     cursorclass=pymysql.cursors.DictCursor)

try:
    with db.cursor() as cursor:
        sql = "CREATE TABLE IF NOT EXISTS `top250` (" \
              "`id` int(6) NOT NULL AUTO_INCREMENT," \
              "`top` int(6) NOT NULL," \
              "`page-code` int(6) NOT NULL," \
              "`title` varchar(255) NOT NULL," \
              "`origin-title` varchar(255)," \
              "`score` float NOT NULL," \
              "`theme` varchar(255) NOT NULL," \
              "PRIMARY KEY(`id`)" \
              ") ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1;"
        cursor.execute(sql)
finally:
    db.commit()

base_url = 'https://movie.douban.com/top250'
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'xxx',
    'Host': 'movie.douban.com',
    'Referer': 'https://movie.douban.com/chart',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'xxx'
}

def crawler(url=None, headers=None, delay=1):
    r = get(url=url, headers=headers, timeout=3)
    soup = BeautifulSoup(r.text, 'html.parser')
    # current page number, e.g. <span class="thispage">8</span>
    page_tag = soup.find('span', attrs={'class': 'thispage'})
    page_code = re.compile(r'<span class="thispage">(\d+)</span>').findall(str(page_tag))[0]
    movie_ranks = soup.find_all('em', attrs={'class': ''})
    movie_titles = soup.find_all('div', attrs={'class': 'hd'})
    movie_scores = soup.find_all('span', attrs={'class': 'rating_num'})
    movie_themes = soup.find_all('span', attrs={'class': 'inq'})
    next_page = soup.find('link', attrs={'rel': 'next'})
    for ranks, titles, scores, themes in zip(movie_ranks, movie_titles, movie_scores, movie_themes):
        rank = re.compile(r'<em class="">(\d+)</em>').findall(str(ranks))[0]
        # a div.hd holds one or two span.title: the Chinese title and,
        # if present, " / original title"
        regex_ts = re.compile(r'<span class="title">(.*?)</span>').findall(str(titles))
        title = regex_ts[0]
        score = re.compile(r'<span class="rating_num"[^>]*>(.*?)</span>').findall(str(scores))[0]
        theme = re.compile(r'<span class="inq">(.*?)</span>').findall(str(themes))[0]
        try:
            origin_title = regex_ts[1]
            # strip the leading " / " separator before the original title
            origin_title = re.compile(r'./.(.+)').findall(origin_title)[0]
            with db.cursor() as cursor:
                sql = "INSERT INTO `top250` (`top`, `page-code`, `title`, `origin-title`, `score`, `theme`)" \
                      " VALUES (%s, %s, %s, %s, %s, %s)"
                cursor.execute(sql, (rank, page_code, title, origin_title, score, theme))
        except IndexError:
            # no original title on this entry: insert without `origin-title`
            with db.cursor() as cursor:
                sql = "INSERT INTO `top250` (`top`, `page-code`, `title`, `score`, `theme`)" \
                      " VALUES (%s, %s, %s, %s, %s)"
                cursor.execute(sql, (rank, page_code, title, score, theme))
        finally:
            db.commit()
    if next_page is not None:
        headers['Referer'] = url
        # <link rel="next" href="?start=25&filter="/> carries the next page's query string
        next_url = base_url + re.compile(r'href="(.*?)"').findall(str(next_page))[0]
        sleep(delay)
        crawler(url=next_url, headers=headers, delay=3)

crawler(base_url, header, 0)
db.close()
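One fragile spot in the code above is the `zip()` over four parallel `find_all` lists: any movie without a one-line quote (`span.inq`) makes `movie_themes` shorter than the other lists, so `zip()` misaligns the pairing and silently drops the page's trailing entries. A sketch of per-item parsing that avoids this, run against a minimal stand-in for Douban's `div.item` markup (the real page layout may differ in detail):

```python
from bs4 import BeautifulSoup

# Minimal stand-in for one page of markup; the second movie has no
# one-line quote (span.inq), which is exactly what breaks zip().
html = '''
<ol>
  <li><div class="item">
    <em class="">1</em>
    <div class="hd"><span class="title">肖申克的救赎</span></div>
    <span class="rating_num">9.7</span>
    <span class="inq">希望让人自由。</span>
  </div></li>
  <li><div class="item">
    <em class="">2</em>
    <div class="hd"><span class="title">霸王别姬</span></div>
    <span class="rating_num">9.6</span>
  </div></li>
</ol>
'''
soup = BeautifulSoup(html, 'html.parser')
movies = []
for item in soup.find_all('div', class_='item'):
    inq = item.find('span', class_='inq')
    movies.append({
        'rank': int(item.find('em').text),
        'title': item.find('span', class_='title').text,
        'score': float(item.find('span', class_='rating_num').text),
        # a missing quote no longer shifts or drops rows
        'theme': inq.text if inq else None,
    })
print(movies)
```

Because each field is looked up inside its own `div.item`, a missing optional field affects only that one record.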

Results:

mysql> select top,title,score from top250 where id = 175;

+-----+--------+-------+

| top | title | score |

+-----+--------+-------+

| 176 | 罗生门 | 8.7 |

+-----+--------+-------+

1 row in set (0.00 sec)

mysql> select top,title,page-code,score from top250 where id = 175;

ERROR 1054 (42S22): Unknown column 'page' in 'field list'

mysql> select top,page-code,title,score from top250 where id = 175;

ERROR 1054 (42S22): Unknown column 'page' in 'field list'

mysql> select page-code from top250 where id = 175;

ERROR 1054 (42S22): Unknown column 'page' in 'field list'

mysql> describe top250

-> ;

+--------------+--------------+------+-----+---------+----------------+

| Field | Type | Null | Key | Default | Extra |

+--------------+--------------+------+-----+---------+----------------+

| id | int(6) | NO | PRI | NULL | auto_increment |

| top | int(6) | NO | | NULL | |

| page-code | int(6) | NO | | NULL | |

| title | varchar(255) | NO | | NULL | |

| origin-title | varchar(255) | YES | | NULL | |

| score | float | NO | | NULL | |

| theme | varchar(255) | NO | | NULL | |

+--------------+--------------+------+-----+---------+----------------+

7 rows in set (0.32 sec)

mysql> select page-code from top250 where id = 175;

ERROR 1054 (42S22): Unknown column 'page' in 'field list'

mysql> select origin-title from top250 where id = 175;

ERROR 1054 (42S22): Unknown column 'origin' in 'field list'

mysql> select origin_title from top250 where id = 175;

ERROR 1054 (42S22): Unknown column 'origin_title' in 'field list'

mysql> select * from top250 where id = 175;

+-----+-----+-----------+--------+--------------+-------+-------------------+

| id | top | page-code | title | origin-title | score | theme |

+-----+-----+-----------+--------+--------------+-------+-------------------+

| 175 | 176 | 8 | 罗生门 | 羅生門 | 8.7 | 人生的N种可能性。 |

+-----+-----+-----------+--------+--------------+-------+-------------------+

1 row in set (0.00 sec)

mysql> select * from top250 where title = 未麻的部屋;

ERROR 1054 (42S22): Unknown column '未麻的部屋' in 'where clause'

mysql> select * from top250 where top=175;

Empty set (0.00 sec)

mysql>

Two small issues:

1. I didn't realize that '-' in database column names needs special handling, so the page-code and origin-title columns can't be queried on their own.

2. No idea why the top-175 movie, Perfect Blue (未麻的部屋), wasn't crawled (possibly a casualty of the zip() over lists of unequal length, since not every movie carries an inq quote).
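On the first issue: hyphenated column names are legal in MySQL, but they must be backtick-quoted everywhere they appear; left unquoted, page-code is parsed as the expression page minus code, which is precisely the "Unknown column 'page'" error above. A minimal demonstration (using sqlite3 only so the sketch runs without a MySQL server; it accepts the same backtick quoting for compatibility), though renaming the columns to page_code / origin_title is the cleaner fix:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
# Backticks make the hyphenated identifier usable, same as in MySQL.
conn.execute("CREATE TABLE top250 (id INTEGER PRIMARY KEY, `page-code` INTEGER, title TEXT)")
conn.execute("INSERT INTO top250 (`page-code`, title) VALUES (8, '罗生门')")
# Unquoted, page-code would be read as the arithmetic expression page - code.
row = conn.execute("SELECT `page-code`, title FROM top250 WHERE id = 1").fetchone()
print(row)  # (8, '罗生门')
```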
