# This code is only an example demonstrating the methods used; the target is a
# Korean forum that requires a proxy to access. Comments and discussion welcome.
import threading
import requests
from bs4 import BeautifulSoup
import base64
import sqlite3
import re
import datetime
from multiprocessing.dummy import Pool as ThreadPool
# import _thread
# Shared SQLite connection used by every scraper thread.
# check_same_thread=False is required because getBBS() runs inside a
# multiprocessing.dummy ThreadPool: with the default (True), any worker
# thread touching this connection raises sqlite3.ProgrammingError.
# NOTE(review): sqlite3 serializes access internally, but concurrent
# commits from many threads still contend on the database lock.
conn = sqlite3.connect('reptile.db', check_same_thread=False)
# Cursor shared by the workers for INSERTs.
c = conn.cursor()
# 目标网站 URL
# url = 'https://www.ppomppu.co.kr/zboard/zboard.php?id=freeboard&hotlist_flag=999'
#爬取整页文章及内容
def getBBS(page):
    """Scrape one listing page of the ppomppu free board and persist articles.

    For every article on listing page *page*: fetch the detail page, extract
    author/title/uid/timestamp, locate the post body, inline every <img> as a
    base64 data URI so the stored HTML is self-contained, and INSERT OR IGNORE
    the row into the getData table on the shared module-level connection.

    page: 1-based board page number.
    Side effects: HTTP requests; rows inserted and committed via `conn`/`c`.
    """
    url = ('https://www.ppomppu.co.kr/zboard/zboard.php'
           '?id=freeboard&hotlist_flag=999&page=' + str(page))
    # A timeout keeps a hung request from stalling a worker thread forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    for article in soup.select('.list0,.list1'):
        author = article.select_one('.list_name').text.strip()
        title = article.select_one('.list_title').text.strip()
        meta_cells = article.find_all('td', class_='eng list_vspace')
        uid = meta_cells[0].text.strip()
        timestamp = meta_cells[1].get('title')
        content_url = ('https://www.ppomppu.co.kr/zboard/'
                       + article.find_all('a')[1].get('href'))

        content_response = requests.get(content_url, timeout=30)
        content_soup = BeautifulSoup(content_response.content, 'html.parser')

        # Post date "YYYY-MM-DD HH:MM" lives in the header box; fall back to
        # the current time (as a string, so the DB column type is consistent)
        # when the header is missing or does not match.
        date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
        header = content_soup.find('div', class_='sub-top-text-box')
        if header is not None:
            match = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                              header.get_text())
            if match:
                date = match.group(0)

        # The post body is the third 'pic_bg' table on this site's layout;
        # guard against deleted/odd pages instead of raising IndexError.
        tables = content_soup.find_all('table', class_='pic_bg')
        if len(tables) < 3:
            continue
        content_element = tables[2]

        # Replace each image src with an inline base64 data URI.
        for image in content_element.find_all('img'):
            image_url = image.get('src', '')
            # The site serves protocol-relative URLs ("//host/..."); only
            # prepend the scheme when it is actually missing.
            if image_url.startswith('//'):
                image_url = 'https:' + image_url
            try:
                image_response = requests.get(image_url, timeout=30)
                encoded = base64.b64encode(image_response.content).decode()
                image['src'] = 'data:image/png;base64,' + encoded
            except requests.RequestException:
                # Best effort: keep the original src if the image fetch fails.
                pass

        print('作者:', author)
        print('标题:', title)
        print('发布时间:', timestamp)

        content = str(content_element)
        # Parameterized INSERT; OR IGNORE skips rows whose key already exists.
        c.execute(
            "INSERT OR IGNORE INTO getData "
            "(dataID,textID,dataName,textUrl,textTitle,textTime,textBody,"
            "textState,textName,regTime,EncodingStr) "
            "VALUES(:dataID,:textID,:dataName,:textUrl,:title,:textTime,"
            ":textBody,:textState,:author,:regTime,:EncodingStr)",
            {"dataID": '1', "textID": uid, "dataName": 'ppomppu',
             'textUrl': content_url, 'title': title, 'textTime': date,
             'textBody': content, 'textState': '1', 'author': author,
             'regTime': timestamp, 'EncodingStr': 'EUC-KR'})
        # Commit per article so progress survives a crash of this worker.
        conn.commit()
# 开始
def startUp(last_page=10000, num_threads=10):
    """Crawl board pages 1..last_page-1 concurrently with a thread pool.

    last_page: exclusive upper bound of page numbers (default preserves the
        original hard-coded 10000).
    num_threads: number of worker threads (I/O-bound work, so threads are
        appropriate despite the GIL).
    """
    pool = ThreadPool(num_threads)
    try:
        # map() blocks until every page has been processed.
        pool.map(getBBS, range(1, last_page))
    finally:
        # Always release the pool, even if a worker raised.
        pool.close()
        pool.join()
if __name__ == '__main__':
    try:
        startUp()
    finally:
        # Close the shared DB connection only when run as a script, and even
        # when startUp() raises. Previously conn.close() executed
        # unconditionally at import time, which would also close the
        # connection for any importer of this module.
        conn.close()