数据库持久化

from threading import Thread
import requests
from bs4 import BeautifulSoup
from re import sub
import pymysql


def get_html(start):
    """Fetch one Douban Top250 listing page and hand its HTML to analyse_data.

    :param start: offset of the first movie on the page (0, 25, 50, ...)
    """
    page_url = f'https://movie.douban.com/top250?start={start}&filter='
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    resp = requests.get(page_url, headers=request_headers)
    analyse_data(resp.text)


def analyse_data(data):
    """Parse one Top250 listing page and persist each movie into MySQL.

    Fix: the connection is now closed in a ``finally`` block — previously an
    exception raised while parsing (e.g. a selector returning None) leaked
    the open connection because ``conn.close()`` was only reached on success.

    :param data: HTML text of one listing page
    """
    conn = pymysql.connect(host='localhost', port=3306,
                           user='root', password='Abc123!!',
                           database='hrs', charset='utf8mb4')
    try:
        soup = BeautifulSoup(data, 'lxml')
        all_li = soup.select('.grid_view>li')
        for li in all_li:
            m_name = li.select_one('.pic>a>img').attrs['alt']
            message = li.select_one('.bd>p').text
            # collapse all whitespace so the blurb stores as one compact string
            m_mess = sub(r'\s+', '', message)
            m_score = li.select_one('.bd>.star>.rating_num').text
            # last span in the star block reads like "123456人评价"; strip the suffix
            m_com = li.select('.bd>.star>span')[-1].text[0:-3:]
            m_rank = li.select_one('.pic>em').text

            try:
                with conn.cursor() as cursor:
                    affected_rows = cursor.execute(
                        'insert into tb_movies values (%s,%s,%s,%s,%s)',
                        (m_rank, m_name, m_score, m_com, m_mess)
                    )
                    if affected_rows == 1:
                        print('添加电影记录成功')
                conn.commit()
            except pymysql.MySQLError as err:
                conn.rollback()
                print('错误提示:', err)
    finally:
        # guarantee the connection is released even if parsing raises
        conn.close()


if __name__ == '__main__':
    # One worker thread per page of 25 movies (10 pages total).
    workers = [Thread(target=get_html, args=(offset,))
               for offset in range(0, 250, 25)]
    for worker in workers:
        worker.start()
    # Wait for every page to finish before the process exits.
    for worker in workers:
        worker.join()

import re
import time
import bs4
import pymysql
import requests


def write_to_db(conn, data):
    """Bulk-insert movie rows into the tb_top_movie table.

    Commits on success; rolls back and prints the error on a MySQL failure.

    :param conn: open database connection object
    :param data: list of (title, rank, comment_count) tuples
    """
    insert_sql = ('insert into tb_top_movie (mov_title, mov_rank, mov_comment_count) '
                  'values (%s, %s, %s)')
    try:
        with conn.cursor() as cursor:
            cursor.executemany(insert_sql, data)
        conn.commit()
    except pymysql.MySQLError as err:
        conn.rollback()
        print(err)


def fetch_page_data(session, url):
    """Scrape one Top250 page and collect its movie rows.

    :param session: session object used to issue the GET request
    :param url: page URL to fetch
    :return: list of (title, rank, comment_count) string tuples;
             empty when the response status is not 200
    """
    rows = []
    count_pattern = re.compile(r'(?P<cc>\d+)')
    resp = session.get(url)
    if resp.status_code != 200:
        return rows
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    title_tags = soup.select('div.info > div.hd > a > span:nth-child(1)')
    rank_tags = soup.select('div.info > div.bd > div > span.rating_num')
    count_tags = soup.select('div.info > div.bd > div > span:nth-child(4)')
    for title_tag, rank_tag, count_tag in zip(title_tags, rank_tags, count_tags):
        # the count span reads like "123456人评价"; keep the leading digits only
        matched = count_pattern.match(count_tag.text)
        comment_count = matched.group('cc') if matched else '0'
        rows.append((title_tag.text, rank_tag.text, comment_count))
    return rows


def main():
    """Crawl all ten Top250 pages and persist the rows into MySQL."""
    session = requests.Session()
    session.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36'
    }
    conn = pymysql.connect(host='localhost', port=3306,
                           user='root', password='Abc123!!',
                           database='hrs', charset='utf8mb4')
    try:
        for page in range(10):
            page_url = f'https://movie.douban.com/top250?start={page * 25}&filter='
            write_to_db(conn, fetch_page_data(session, page_url))
            time.sleep(2)  # throttle requests between pages
    except Exception as err:
        print(err)
    finally:
        conn.close()


if __name__ == '__main__':
    # Run the crawl only when executed as a script, not on import.
    main()

卡牌信息爬取

-- Select the hrs schema and create the card table the crawler below inserts into.
use `hrs`;
create table `tb_card`
(
-- card name is the natural key, so duplicate inserts fail loudly
`name` varchar(30) not null comment '卡牌名称',
`message` varchar(200) not null comment '卡牌信息',
primary key (`name`)
) engine=innodb comment '游戏王卡牌表';
from threading import Thread
import requests
import pymysql
from lxml import etree


def get_html(start):
    """Fetch one card-list page from ourocg.cn and persist each card to MySQL.

    Fix: the connection is now closed in a ``finally`` block — previously an
    exception raised while extracting card fields (e.g. an empty xpath result
    indexed with ``[0]``) leaked the open connection because ``conn.close()``
    was only reached when the whole loop succeeded.

    :param start: 1-based page number of the card list
    """
    url = f'https://www.ourocg.cn/card/list-5/{start}'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # trust requests' charset detection; the page encoding isn't declared reliably
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    all_cards = html.xpath('//div[@class="card-list"]/li')
    conn = pymysql.connect(host='localhost', port=3306,
                           user='root', password='Abc123!!',
                           database='hrs', charset='utf8mb4')
    try:
        for card in all_cards:
            name = card.xpath('h3/a/text()')[0]
            # the trailing text node of <h3> carries the card's info line
            message = card.xpath('h3/text()')[-1]

            try:
                with conn.cursor() as cursor:
                    affected_rows = cursor.execute(
                        'insert into tb_card values (%s, %s)',
                        (name, message)
                    )
                    if affected_rows == 1:
                        print('添加记录成功')
                conn.commit()
            except pymysql.MySQLError as err:
                conn.rollback()
                print('错误提示:', err)
    finally:
        # guarantee the connection is released even if extraction raises
        conn.close()


if __name__ == '__main__':
    # One worker thread per card-list page (pages 1 through 20).
    workers = [Thread(target=get_html, args=(page,))
               for page in range(1, 21)]
    for worker in workers:
        worker.start()
    # Block until every page has been processed.
    for worker in workers:
        worker.join()

自动编号设置和代码

-- Variant of the card table with a surrogate auto-increment key instead of
-- using the card name as primary key (allows duplicate names).
use `hrs`;
create table `tb_cards`
(`cards_id`  int unsigned not null auto_increment comment '编号',
`name` varchar(30) not null comment '卡牌名称',
`message` varchar(200) not null comment '卡牌信息',
primary key (`cards_id`)
) engine=innodb comment '游戏王卡牌表';
 with conn.cursor() as cursor:
                affected_rows = cursor.execute(
                    'insert into tb_cards (name, message)'
                    'values (%s, %s)',
                    (name, message)
                )
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值