This article uses a MySQL database (managed with DBeaver) together with Python to crawl posts from the Eastmoney Guba stock forum. Because of the site's anti-crawler measures, the script adds a random User-Agent and a random delay between requests. Even so, after crawling fifty or sixty pages the returned page tends to change, usually to the Founder Securities board (方正证券吧); if that happens, wait about an hour and then continue crawling.
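The script below does not detect this page change by itself. As a minimal sketch (not part of the original code), one way to handle it is to check whether the returned HTML still looks like the 600000 board and, if not, sleep for an hour before retrying; the title-based check and the helper names here are assumptions and may need adjusting to the actual page markup:

import time
from parsel import Selector

def looks_like_target_board(html, stock_code='600000'):
    # Heuristic (assumption): the board page title is expected to contain the stock code.
    title = Selector(text=html).xpath('//title/text()').get() or ''
    return stock_code in title

def wait_if_redirected(html, pause_seconds=3600):
    # If the page no longer looks like the target board, pause for about an hour before retrying.
    if html and not looks_like_target_board(html):
        print('Page appears to have changed, sleeping before retrying...')
        time.sleep(pause_seconds)
        return True
    return False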
The following SQL creates the table used to store the crawled data:
-- pfyh.comment definition
CREATE TABLE `comment` (
  `id` varchar(200) NOT NULL,
  `read_count` int DEFAULT NULL,
  `reply` int DEFAULT NULL,
  `title` varchar(200) DEFAULT NULL,
  `title_url` varchar(200) DEFAULT NULL,
  `author` varchar(200) DEFAULT NULL,
  `author_url` varchar(200) DEFAULT NULL,
  `update_time` varchar(200) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
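The DDL above can be run directly in DBeaver. Alternatively, as a small sketch (not in the original post), the crawler could create the table itself on startup with pymysql, reusing the same column definitions; the ensure_table helper below is hypothetical:

import pymysql

CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS `comment` (
  `id` varchar(200) NOT NULL,
  `read_count` int DEFAULT NULL,
  `reply` int DEFAULT NULL,
  `title` varchar(200) DEFAULT NULL,
  `title_url` varchar(200) DEFAULT NULL,
  `author` varchar(200) DEFAULT NULL,
  `author_url` varchar(200) DEFAULT NULL,
  `update_time` varchar(200) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci
"""

def ensure_table(connection):
    # Create the comment table if it does not exist yet.
    with connection.cursor() as cursor:
        cursor.execute(CREATE_TABLE_SQL)
    connection.commit()

# Usage (credentials are placeholders):
# db = pymysql.connect(host='localhost', user='root', password='...', database='pfyh', port=3306)
# ensure_table(db)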
import random
import re
import time
import requests
import pymysql
from parsel import Selector
from colorama import Fore, init
from fake_useragent import UserAgent
# Initialize colorama for colored console output
init()
"""
user一般都为root
password填写你自己设置的数据库密码
database为你的数据库
port基本为3306
"""
base_url = 'https://guba.eastmoney.com'
db = pymysql.connect(host='localhost', user='root', password='527630', database='pfyh', port=3306)
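# Optional (not part of the original script): the connection settings above can be
# read from environment variables instead of being hardcoded, for example:
#   import os
#   db = pymysql.connect(host=os.environ.get('DB_HOST', 'localhost'),
#                        user=os.environ.get('DB_USER', 'root'),
#                        password=os.environ['DB_PASSWORD'],
#                        database=os.environ.get('DB_NAME', 'pfyh'),
#                        port=int(os.environ.get('DB_PORT', '3306')))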
def get_random_user_agent():
    """Return a random User-Agent string."""
    try:
        ua = UserAgent()
        return ua.random
    except Exception as e:
        print(Fore.RED + f"Error creating UserAgent: {e}")
        return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
def get_headers():
    """Build the request headers."""
    return {
        'User-Agent': get_random_user_agent(),
        'Host': 'guba.eastmoney.com'
    }
def make_request(url):
    """Perform the HTTP request with simple retry logic."""
    retries = 3
    for _ in range(retries):
        try:
            print(Fore.BLACK + '-' * 50 + ' Fetching: ' + url)
            response = requests.get(url, headers=get_headers(), timeout=10)
            if response.status_code == 200:
                return response.text
            else:
                print(Fore.RED + f'Failed to fetch page, status code: {response.status_code}')
        except requests.RequestException as e:
            print(Fore.RED + f'Request error: {e}')
        time.sleep(5)  # wait 5 seconds before retrying
    return None
def crawl_content(count=1):
    """Fetch the HTML content for the given page number of the 600000 board."""
    url = f'{base_url}/list,600000_{count}.html' if count != 1 else f'{base_url}/list,600000.html'
    return make_request(url)
def spider_out_comment(content):
    """Parse the HTML content and extract the fields we need."""
    selector = Selector(text=content)
    list_body = selector.xpath('//li[contains(@class, "defaultlist")]/table[contains(@class, "default_list")]/tbody[contains(@class, "listbody")]/tr[contains(@class, "listitem")]')
    data = []
    for item in list_body:
        data.append({
            'id': re.findall('[0-9]+', item.css('div.title > a').attrib['href'])[0],
            'read_count': item.css('div.read::text').get(),
            'reply': item.css('div.reply::text').get(),
            'title': item.css('div.title > a::text').get(),
            'title_url': item.css('div.title > a').attrib['href'],
            'author': item.css('div.author > a::text').get(),
            'author_url': item.css('div.author > a').attrib['href'],
            'update_time': item.xpath('.//div[contains(@class, "update")]/text()').get()
        })
    return data
def save_mysql(data):
    """Save the extracted rows to the MySQL database."""
    cursor = db.cursor()
    try:
        for item in data:
            sql = 'INSERT INTO comment (id, read_count, reply, title, title_url, author, author_url, update_time) ' \
                  'VALUES (%s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE read_count = VALUES(read_count), reply = VALUES(reply)'
            cursor.execute(sql, tuple(item.values()))
        db.commit()
        print(Fore.GREEN + f'Saved {len(data)} rows successfully.')
    except pymysql.Error as e:
        db.rollback()
        print(Fore.RED + f'Failed to save data: {e}')
    finally:
        cursor.close()
if __name__ == '__main__':
    try:
        for i in range(700, 1500):  # crawl pages 700 to 1499; adjust the range as needed
            content = crawl_content(i)
            if content:
                data = spider_out_comment(content)
                save_mysql(data)
            time.sleep(random.uniform(1, 30))  # random delay between pages; tune as you like
    finally:
        db.close()
        print(Fore.GREEN + 'Database connection closed.')
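After a run finishes, the saved rows can be inspected directly in DBeaver. As a quick sanity check (a sketch, not part of the original script; the show_summary helper is hypothetical), the same pymysql settings can be reused to count the rows and print a small sample:

import pymysql

def show_summary(host='localhost', user='root', password='your-password', database='pfyh', port=3306):
    # Print how many posts were saved and a few example rows.
    db = pymysql.connect(host=host, user=user, password=password, database=database, port=port)
    try:
        with db.cursor() as cursor:
            cursor.execute('SELECT COUNT(*) FROM comment')
            total = cursor.fetchone()[0]
            print(f'{total} rows in comment')
            cursor.execute('SELECT id, title, read_count, reply, update_time FROM comment LIMIT 5')
            for row in cursor.fetchall():
                print(row)
    finally:
        db.close()

# show_summary(password='your-password')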