爬取猫眼电影榜单Top100

废话不多说,直接上代码

import json
import requests
from requests.exceptions import RequestException
import re

def get_one_page(url):
    """Fetch one board page and return its HTML text, or None on failure.

    Returns None both for non-200 status codes and for any request-level
    error (connection refused, DNS failure, timeout, ...), so callers
    must be prepared to handle a None result.
    """
    # Maoyan rejects the default `python-requests` User-Agent, so send a
    # browser-like one; an explicit timeout prevents a hung connection
    # from blocking the whole crawl.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/120.0 Safari/537.36')
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    """Parse one board page's HTML and yield one dict per movie.

    Each yielded dict has string values:
      index  - rank on the board
      image  - poster URL (the ``data-src`` attribute)
      name   - movie title
      actor  - cast list with the leading "主演:" label (3 chars) stripped
      time   - release date with the leading "上映时间:" label (5 chars) stripped
      score  - integer and fraction parts of the rating joined, e.g. "9.5"
    """
    # Raw string avoids the invalid-escape-sequence warning for ``\d``
    # on modern Python; re.S lets ``.`` span newlines inside each <dd>.
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)'
                         r'</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>'
                         r'.*?fraction">(.*?)</i>.*?</dd>', re.S)
    for item in re.findall(pattern, html):
        yield {
            'index': item[0],
            'image': item[1],
            'name': item[2],
            'actor': item[3].strip()[3:],   # drop the "主演:" prefix
            'time': item[4].strip()[5:],    # drop the "上映时间:" prefix
            'score': item[5] + item[6]      # e.g. "9." + "5" -> "9.5"
        }

def write_to_file(content):
    """Append one movie record to ``result.txt`` as a JSON line.

    ``ensure_ascii=False`` keeps Chinese titles human-readable in the
    file; the file is opened as UTF-8 so those characters round-trip.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
        # NOTE: no explicit f.close() — the ``with`` block already
        # closes the file; the original's extra close() was redundant.

def main(offset):
    """Crawl one page of the Top100 board (10 movies per page offset)
    and persist every parsed movie to the result file.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    # get_one_page returns None on any failure; the original passed that
    # straight to the parser, which raised TypeError inside re.findall.
    if html is None:
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    # The Top100 board is paginated 10 movies per page, so the page
    # offsets are 0, 10, ..., 90.
    for offset in range(0, 100, 10):
        main(offset)
好的,我明白了。首先,你需要安装相关的库: ```python pip install scrapy pymysql ``` 接着,在`settings.py`中配置MySQL数据库连接信息: ```python MYSQL_HOST = 'localhost' # 数据库地址 MYSQL_PORT = 3306 # 数据库端口 MYSQL_USER = 'your_username' # 数据库用户名 MYSQL_PASSWORD = 'your_password' # 数据库密码 MYSQL_DATABASE = 'mydb' # 数据库名称 ``` 然后,在`pipelines.py`中编写代码实现连接数据库、创建表和插入数据: ```python import pymysql class MymoviesPipeline: def __init__(self, mysql_host, mysql_port, mysql_user, mysql_password, mysql_database): self.mysql_host = mysql_host self.mysql_port = mysql_port self.mysql_user = mysql_user self.mysql_password = mysql_password self.mysql_database = mysql_database @classmethod def from_crawler(cls, crawler): return cls( mysql_host=crawler.settings.get('MYSQL_HOST'), mysql_port=crawler.settings.get('MYSQL_PORT'), mysql_user=crawler.settings.get('MYSQL_USER'), mysql_password=crawler.settings.get('MYSQL_PASSWORD'), mysql_database=crawler.settings.get('MYSQL_DATABASE') ) def open_spider(self, spider): self.conn = pymysql.connect( host=self.mysql_host, port=self.mysql_port, user=self.mysql_user, password=self.mysql_password, database=self.mysql_database, charset='utf8mb4' ) self.cur = self.conn.cursor() def close_spider(self, spider): self.cur.close() self.conn.close() def process_item(self, item, spider): sql = ''' CREATE TABLE IF NOT EXISTS mymovies ( id INT(11) NOT NULL AUTO_INCREMENT PRIMARY KEY, name VARCHAR(255) NOT NULL, starts VARCHAR(255) NOT NULL, releasetime VARCHAR(255) NOT NULL, score FLOAT(2, 1) NOT NULL ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; ''' self.cur.execute(sql) sql = ''' INSERT INTO mymovies (name, starts, releasetime, score) VALUES (%s, %s, %s, %s) ''' self.cur.execute(sql, ( item['name'], item['starts'], item['releasetime'], item['score'] )) self.conn.commit() return item ``` 最后,在`items.py`中定义爬取的数据字段: ```python import scrapy class MymoviesItem(scrapy.Item): name = scrapy.Field() starts = scrapy.Field() releasetime = scrapy.Field() score = scrapy.Field() ``` 
这样就可以完成爬取猫眼电影榜单TOP100榜的100部电影名字、主演、上映时间、评分等信息,并保存至本地MySQL数据库mydb中了。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值