from threading import Thread
import requests
from bs4 import BeautifulSoup
from re import sub
import pymysql
def get_html(start):
    """Download one page of the Douban Top 250 list and hand it to the parser.

    :param start: offset placed in the ``start`` query parameter of the
        list URL (the launcher passes 0, 25, ..., 225).
    """
    url = f'https://movie.douban.com/top250?start={start}&filter='
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    # requests waits forever by default; a timeout keeps a stalled connection
    # from blocking this worker thread indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    # Skip responses that are not HTTP 200 so that error pages are not parsed
    # and no database connection is opened for them.
    if response.status_code == 200:
        analyse_data(response.text)
def analyse_data(data):
    """Parse one Top 250 page and insert every listed movie into ``tb_movies``.

    A MySQL connection is opened for the duration of the call and is always
    closed before returning, even when parsing raises. Each movie is inserted
    and committed on its own; a failed insert is rolled back and reported
    without stopping the remaining movies.

    :param data: HTML text of the list page.
    """
    conn = pymysql.connect(host='localhost', port=3306,
                           user='root', password='Abc123!!',
                           database='hrs', charset='utf8mb4')
    try:
        soup = BeautifulSoup(data, 'lxml')
        for li in soup.select('.grid_view>li'):
            m_name = li.select_one('.pic>a>img').attrs['alt']
            message = li.select_one('.bd>p').text
            # Strip every run of whitespace from the description paragraph.
            m_mess = sub(r'\s+', '', message)
            m_score = li.select_one('.bd>.star>.rating_num').text
            # Text of the last span with its final three characters removed
            # (presumably a textual suffix after the number -- confirm
            # against the live page).
            m_com = li.select('.bd>.star>span')[-1].text[0:-3:]
            m_rank = li.select_one('.pic>em').text
            try:
                with conn.cursor() as cursor:
                    # No column list is given, so the value order must match
                    # the column order of tb_movies.
                    affected_rows = cursor.execute(
                        'insert into tb_movies values (%s,%s,%s,%s,%s)',
                        (m_rank, m_name, m_score, m_com, m_mess)
                    )
                    if affected_rows == 1:
                        print('添加电影记录成功')
                conn.commit()
            except pymysql.MySQLError as err:
                conn.rollback()
                print('错误提示:', err)
    finally:
        # Release the connection even if parsing one of the entries fails.
        conn.close()
# Script entry point: fetch the ten list pages concurrently, one thread per
# page offset, and wait until every worker has finished.
if __name__ == '__main__':
    workers = [Thread(target=get_html, args=(offset,))
               for offset in range(0, 250, 25)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
import re
import time
import bs4
import pymysql
import requests
def write_to_db(conn, data):
    """Write a batch of movie records into the ``tb_top_movie`` table.

    The whole batch is inserted with one ``executemany`` call and committed.
    If MySQL reports an error the transaction is rolled back and the error
    is printed; it is not re-raised.

    :param conn: open database connection object
    :param data: list of ``(title, rank, comment_count)`` tuples to insert
    """
    # Nothing to insert (e.g. the page could not be fetched): skip the
    # cursor and the commit altogether.
    if not data:
        return
    try:
        with conn.cursor() as cursor:
            cursor.executemany(
                'insert into tb_top_movie (mov_title, mov_rank, mov_comment_count) '
                'values (%s, %s, %s)',
                data
            )
        conn.commit()
    except pymysql.MySQLError as err:
        conn.rollback()
        print(err)
def fetch_page_data(session, url):
    """Fetch one list page and extract its movie records.

    :param session: session object used to issue the GET request
    :param url: address of the page to fetch
    :return: list of ``(title, rank, comment_count)`` tuples of strings, where
        ``rank`` is the text of the ``span.rating_num`` element and
        ``comment_count`` is the first run of digits in the comment span
        (``'0'`` if it has none); an empty list when the response status is
        not 200
    """
    # A timeout keeps a stalled connection from blocking the crawl forever.
    resp = session.get(url, timeout=10)
    if resp.status_code != 200:
        return []
    pattern = re.compile(r'(?P<cc>\d+)')
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    title_spans = soup.select('div.info > div.hd > a > span:nth-child(1)')
    rank_spans = soup.select('div.info > div.bd > div > span.rating_num')
    comment_count_spans = soup.select('div.info > div.bd > div > span:nth-child(4)')
    data = []
    # zip() pairs the three lists positionally and stops at the shortest one.
    for title_span, rank_span, comment_count_span in \
            zip(title_spans, rank_spans, comment_count_spans):
        # search() rather than match(): the digits are still found when the
        # span text starts with whitespace or other characters.
        matcher = pattern.search(comment_count_span.text)
        comment_count = matcher.group('cc') if matcher else '0'
        data.append((title_span.text, rank_span.text, comment_count))
    return data
def main():
    """Crawl the ten Top 250 list pages one after another and store them.

    Each page is fetched with ``fetch_page_data`` and written with
    ``write_to_db``, with a two-second pause after every page. Any exception
    raised during the crawl is printed and ends the loop; the HTTP session
    and the database connection are closed in every case.
    """
    # Using the session as a context manager closes it on exit.
    with requests.Session() as session:
        session.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36'
        }
        conn = pymysql.connect(host='localhost', port=3306,
                               user='root', password='Abc123!!',
                               database='hrs', charset='utf8mb4')
        try:
            for page in range(10):
                url = f'https://movie.douban.com/top250?start={page * 25}&filter='
                data = fetch_page_data(session, url)
                write_to_db(conn, data)
                # Pause between pages instead of requesting them back to back.
                time.sleep(2)
        except Exception as err:
            # Top-level boundary of the script: report the failure and stop.
            print(err)
        finally:
            conn.close()
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
卡牌信息爬取
-- Switch to the `hrs` schema and create the table that stores crawled cards.
use `hrs`;
-- One row per card. The card name is the primary key, so two cards with the
-- same name cannot both be stored.
create table `tb_card`
(
`name` varchar(30) not null comment '卡牌名称',
`message` varchar(200) not null comment '卡牌信息',
primary key (`name`)
) engine=innodb comment '游戏王卡牌表';
from threading import Thread
import requests
import pymysql
from lxml import etree
def get_html(start):
    """Download one card-list page and insert its cards into ``tb_card``.

    Every card is inserted and committed on its own; a failed insert is
    rolled back and reported without stopping the remaining cards. List items
    that lack a name or a message are skipped. The database connection is
    always closed before returning.

    :param start: page number appended to the card-list URL.
    """
    url = f'https://www.ourocg.cn/card/list-5/{start}'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    # requests waits forever by default; a timeout keeps a stalled connection
    # from blocking this worker thread indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    all_cards = html.xpath('//div[@class="card-list"]/li')
    conn = pymysql.connect(host='localhost', port=3306,
                           user='root', password='Abc123!!',
                           database='hrs', charset='utf8mb4')
    try:
        for card in all_cards:
            names = card.xpath('h3/a/text()')
            messages = card.xpath('h3/text()')
            # xpath() returns an empty list when nothing matches; indexing
            # it would raise IndexError and abort the whole page.
            if not names or not messages:
                continue
            name, message = names[0], messages[-1]
            try:
                with conn.cursor() as cursor:
                    affected_rows = cursor.execute(
                        'insert into tb_card values (%s, %s)',
                        (name, message)
                    )
                    if affected_rows == 1:
                        print('添加记录成功')
                conn.commit()
            except pymysql.MySQLError as err:
                conn.rollback()
                print('错误提示:', err)
    finally:
        # Release the connection even if processing a card fails.
        conn.close()
# Script entry point: fetch list pages 1 through 20 concurrently, one thread
# per page, and wait until every worker has finished.
if __name__ == '__main__':
    workers = [Thread(target=get_html, args=(page,))
               for page in range(1, 21)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
自动编号设置和代码
-- Switch to the `hrs` schema and create a card table with a generated key.
use `hrs`;
-- `cards_id` is an auto-increment primary key, so inserts supply only
-- `name` and `message`, and card names are not required to be unique.
create table `tb_cards`
(`cards_id` int unsigned not null auto_increment comment '编号',
`name` varchar(30) not null comment '卡牌名称',
`message` varchar(200) not null comment '卡牌信息',
primary key (`cards_id`)
) engine=innodb comment '游戏王卡牌表';
# Insert one card into tb_cards. Only name and message are listed, so the
# auto-increment cards_id column is filled in by the database.
insert_sql = ('insert into tb_cards (name, message)'
              'values (%s, %s)')
with conn.cursor() as cursor:
    affected_rows = cursor.execute(insert_sql, (name, message))