本科大数据相关专业,毕设做的是深度学习相关内容。
工作中需要一个爬虫,能在短时间内爬取约 1000 万(1kw)个网页,只需要提取粗略的文本信息即可。
简单写了一个爬虫,基本能用。爬完大概花了一周左右,差不多一天100w的速度。没用代理,速度不敢调太快。加了代理应该可以再快点。
import csv
import random
import re
import time
from multiprocessing import Pool, cpu_count
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from helper import get_page_info
# Adjust these to your own needs: column names and the input file path.
g_url = '域名'
g_text = '文本'
test = '文件路径'
# Read the headerless CSV and name the columns.  The text column is given
# dtype object — presumably so later string assignment is safe; confirm.
df = pd.read_csv(test, header=None, names=[g_url, g_text], dtype = {g_text : object})
# Crawl page info for all URLs with a multiprocessing pool.
def crawl_pages(urls, num_processes=None, chunksize=16):
    """Fetch every URL in *urls* in parallel and collect the results.

    Args:
        urls: iterable of URL strings; must support len() for the
            progress-bar total.
        num_processes: worker count; defaults to cpu_count() - 1 so the
            machine stays responsive while crawling.
        chunksize: number of URLs handed to a worker per dispatch.  The
            imap_unordered default of 1 costs one IPC round-trip per URL,
            which is needlessly slow when crawling millions of pages;
            batching amortises that overhead.

    Returns:
        list of (url, text) tuples, in completion order (NOT input order).
    """
    if num_processes is None:
        num_processes = max(1, cpu_count() - 1)
    with Pool(num_processes) as pool:
        results = list(
            tqdm(
                pool.imap_unordered(get_page_info, urls, chunksize=chunksize),
                total=len(urls),
                desc="Crawling pages",
            )
        )
    return results
# Crawl all pages and merge the scraped text back into the DataFrame.
if __name__ == "__main__":
    results = crawl_pages(df[g_url])
    # Build a URL -> row-index map once.  The original per-result filter
    # (df[df[g_url] == url]) scans every row for every URL, i.e. O(n^2)
    # over ~10M rows.  A dict lookup makes the merge pass O(n).
    # setdefault keeps the FIRST index for duplicate URLs, matching the
    # original matching_rows.index[0] behaviour.
    url_to_index = {}
    for idx, u in zip(df.index, df[g_url]):
        url_to_index.setdefault(u, idx)
    for url, text in results:
        index = url_to_index.get(url)
        if index is not None:
            df.at[index, g_text] = text
        else:
            print(f"No matching URL found in DataFrame for: {url}")
    # Write the enriched DataFrame back out as CSV.
    df.to_csv('train4.csv', index=False, encoding = 'utf-8')
主脚本中 import 的 helper 模块(helper.py)的完整内容如下:
import requests
from bs4 import BeautifulSoup
import re
import random
import time
# Browser User-Agent strings (desktop Chrome/Firefox plus one mobile).
# One is picked at random per request to make traffic look like normal
# browsers instead of a single bot signature.
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36 Edg/111.0.1661.44",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0",
]
# Strip all HTML markup and normalise the remaining text.
def clean_text(text):
    """Return the plain text of an HTML string.

    Only CJK Unified Ideographs, ASCII letters, digits and spaces are
    kept; everything else is deleted.  Returns the literal string
    'No text' when nothing survives cleaning.
    """
    soup = BeautifulSoup(text, 'html.parser')
    # NOTE: the original also rewrote each <a href="..."> attribute before
    # extraction, but get_text() concatenates text nodes only — attribute
    # values never appear in its output — so that loop was dead work and
    # has been removed.  Output is unchanged.
    text = soup.get_text().strip()
    # Keep Chinese characters, ASCII alphanumerics and spaces only.
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9 ]', '', text)
    if not text.strip():
        text = 'No text'
    return text
# Fetch a single page and return its cleaned text.
def get_page_info(url):
    """Download *url* and return a (url, cleaned_text) tuple.

    The first element is always the INPUT url (even after redirects) so
    the caller can match the result back to its source row.  Any failure
    — DNS, timeout, HTTP error, parse error — is caught and reported as
    (url, 'No text') so one bad page never kills a worker process.
    """
    try:
        # Request headers mimicking a normal browser.
        headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "close",
            "Referer": "https://www.baidu.com/",
            "Origin": "https://www.baidu.com/",
        }
        request_url = url
        # Prepend a scheme when the CSV stores bare host names.
        if not re.match(r'https?://', request_url):
            request_url = 'http://' + request_url
        # Use a Session (redirects are followed automatically) and close it
        # deterministically: the original leaked one connection pool per
        # call, which matters over ~10M calls.
        with requests.Session() as session:
            session.headers.update(headers)
            # Mount retrying adapters explicitly.  The original mutated
            # requests.adapters.DEFAULT_RETRIES *after* the session and its
            # adapters were created, which adds no retries to this request.
            retry_adapter = requests.adapters.HTTPAdapter(max_retries=5)
            session.mount('http://', retry_adapter)
            session.mount('https://', retry_adapter)
            response = session.get(
                request_url,
                timeout=4,
                proxies={"http": None, "https": None},
            )
        # Random 0–2 s delay to stay polite while crawling without proxies.
        time.sleep(random.uniform(0, 2))
        # from_encoding=None lets BeautifulSoup auto-detect the encoding.
        # (The page is parsed again inside clean_text via str(soup); kept
        # as-is to preserve byte-for-byte output behaviour.)
        soup = BeautifulSoup(response.content, 'html.parser', from_encoding=None)
        text = clean_text(str(soup))
        return (url, text)
    except Exception as e:
        # Best-effort crawler: log the failure and keep going.
        print(f"Error crawling page {url}: {e}")
        return (url, 'No text')
目前基本可用。遇到的问题想到再说。以上。