Approach
First, read each company's URL from a file; use Selenium to visit the page while simulating a real user's browser; then locate the required elements with XPath; finally, save the results to a file.
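As a minimal sketch of that pipeline (no proxy handling yet; assumes Selenium 4.6+, which fetches a matching chromedriver automatically, and a hypothetical company page with an h1.name heading):

import time
from selenium import webdriver
from lxml import etree

driver = webdriver.Chrome()                      # Selenium Manager resolves the driver
driver.get("https://example.com/company/1")      # hypothetical company page
time.sleep(5)                                    # crude wait for the page to render
tree = etree.HTML(driver.page_source)            # parse the rendered HTML with lxml
print(tree.xpath('//h1[@class="name"]/text()'))  # locate the target element via XPath
driver.quit()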
Problems
Too many requests from a single IP get the scraper blocked with HTTP 403.
Solution: configure a rotating IP proxy pool.
Selenium cannot natively use a proxy pool that requires username/password authentication.
Solution: package the credentials into a small Chrome extension, following Selenium 如何使用代理 IP 进行 Web 爬虫(无认证实现、有账号密码认证实现)-腾讯云开发者社区-腾讯云 (tencent.com). The two helpers below build that extension on the fly and attach it to the browser session:
def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password, scheme='http', plugin_path='Selenium-Chrome-HTTP-Private-Proxy.zip'):
    """Build a Chrome extension (zip) that sets a fixed proxy and answers its auth challenge."""
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version": "22.0.0"
    }
    """
    background_js = f"""
    var config = {{
        mode: "fixed_servers",
        rules: {{
            singleProxy: {{
                scheme: "{scheme}",
                host: "{proxy_host}",
                port: parseInt("{proxy_port}")
            }},
            bypassList: ["foobar.com"]
        }}
    }};
    chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
    function callbackFn(details) {{
        return {{
            authCredentials: {{
                username: "{proxy_username}",
                password: "{proxy_password}"
            }}
        }};
    }}
    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {{urls: ["<all_urls>"]}},
        ['blocking']
    );
    """
    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)
    return plugin_path
def configure_headless_browser(proxy_config):
    """Configure and return a Chrome browser that routes traffic through the given proxy."""
    chrome_options = Options()
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    prefs = {"profile.managed_default_content_settings.images": 2}  # disable image loading to speed up page loads
    chrome_options.add_experimental_option("prefs", prefs)
    # Build the proxy-auth extension. Note that legacy headless Chrome cannot load
    # extensions; pass "--headless=new" (Chrome 109+) if headless operation is needed.
    proxyauth_plugin_path = create_proxyauth_extension(
        proxy_host=proxy_config[0],
        proxy_port=proxy_config[1],
        proxy_username=proxy_config[2],
        proxy_password=proxy_config[3]
    )
    chrome_options.add_extension(proxyauth_plugin_path)
    chrome_service = Service("./chromedriver.exe")
    return webdriver.Chrome(service=chrome_service, options=chrome_options)
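A quick way to sanity-check the proxy wiring (a sketch; the proxy tuple values are placeholders, and httpbin.org/ip simply echoes the caller's IP):

proxy_config = ("1.2.3.4", "8080", "user", "pass")  # hypothetical (host, port, user, password)
driver = configure_headless_browser(proxy_config)
try:
    driver.get("https://httpbin.org/ip")  # echoes the IP the request arrived from
    print(driver.page_source)             # should show the proxy's IP, not your own
finally:
    driver.quit()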
Implementation
Reading the URLs
def read_urls_from_excel(excel_file, sheet_name='Sheet1', url_column='URL'):
    """Read the list of URLs from an Excel file."""
    df = pd.read_excel(excel_file, sheet_name=sheet_name)
    return df[url_column].tolist()
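For reference, the function assumes a workbook laid out like this (a sketch; the URLs are placeholders):

import pandas as pd

# A single 'URL' column on 'Sheet1' -- the defaults read_urls_from_excel expects
pd.DataFrame({"URL": [
    "https://example.com/company/1",
    "https://example.com/company/2",
]}).to_excel("url.xlsx", sheet_name="Sheet1", index=False)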
Spawning worker threads to iterate over the URLs, with a proxy assigned to each (the index arithmetic is demonstrated below)
lock = Lock()        # serializes writes to the output workbook
proxy_lock = Lock()  # shared by all workers so the proxy pool is refreshed by one thread at a time
try:
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for i, url in enumerate(urls):
            # Each block of five consecutive URLs shares one proxy; % wraps around the pool
            proxy_config = proxy_config_list[i // 5 % len(proxy_config_list)]
            futures.append(executor.submit(load_html_and_save, url, proxy_config, output_excel, lock, proxy_lock, api_url, proxy_config_list))
        for future in as_completed(futures):
            future.result()  # re-raises any exception from the worker
except Exception as e:
    print(f"Error during execution: {e}")
Loading each URL's HTML and extracting the required fields
def load_html(url, proxy_config, proxy_lock, api_url, proxy_config_list):
    """Load the URL's HTML and extract the required fields."""
    driver = configure_headless_browser(proxy_config)
    try:
        driver.get(url)
        time.sleep(15)  # wait 15 seconds for the page to finish rendering
        page_source = driver.page_source
        # Detect either a 403 page or the site's anti-bot verification prompt
        if "403" in page_source or "当前 IP 地址可能存在异常访问行为,完成验证后即可正常使用" in page_source:
            print(f"{url} returned 403, rotating the proxy...")
            update_proxy_config(api_url, proxy_config_list, proxy_lock)
            return load_html(url, proxy_config_list[0], proxy_lock, api_url, proxy_config_list)  # retry with the refreshed proxy
        # Parse the rendered HTML with lxml
        tree = etree.HTML(page_source)
        print(tree.xpath('//h1[@class="name"]/text()'))
        data = {
            '企业名称': tree.xpath('//h1[@class="name"]/text()'),
            '公司名称': tree.xpath("//ul/li[@class='business-detail-name']/text()"),
            '法定代表人': tree.xpath("//ul/li[@class='business-detail-user']/text()"),
            '成立时间': tree.xpath("//ul/li[@class='business-detail-time']/text()"),
            '公司类型': tree.xpath("//ul/li[@class='business-detail-type']/text()"),
            '公司状态': tree.xpath("//ul/li[@class='business-detail-status']/text()"),
            '注册资本': tree.xpath("//ul/li[@class='business-detail-money']/text()"),
            '地址': tree.xpath("//ul/li[@class='business-detail-location']/text()"),
            '期限': tree.xpath("//ul/li[@class='business-detail-business-time w-210px']/text()"),
            '所属地区': tree.xpath("//ul/li[@class='business-detail-belone-location w-150px']/text()"),
            '统一社会信用代码': tree.xpath("//ul/li[@class='business-detail-id']/text()"),
            '审核时间': tree.xpath("//ul/li[@class='business-detail-check-time w-210px']/text()"),
            '主管机关': tree.xpath("//ul/li[@class='business-detail-orang col-auto']/text()")
        }
        return data
    except Exception as e:
        print(f"Error while loading {url}: {e}")
        return None
    finally:
        driver.quit()
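One caveat: the recursive retry above has no depth limit, so a pool that keeps serving blocked IPs would recurse until Python's recursion limit. A bounded alternative (a sketch, not the original code; load_html_once is a hypothetical variant of load_html whose 403 branch returns None instead of recursing):

def load_html_bounded(url, proxy_config, proxy_lock, api_url, proxy_config_list, max_retries=3):
    """Retry with a fresh proxy at most max_retries times instead of recursing."""
    for attempt in range(max_retries):
        data = load_html_once(url, proxy_config)  # hypothetical non-recursive variant
        if data is not None:
            return data
        update_proxy_config(api_url, proxy_config_list, proxy_lock)  # refresh the pool
        proxy_config = proxy_config_list[0]  # pick up the refreshed proxy
        print(f"{url}: retry {attempt + 1}/{max_retries} with a new proxy")
    return None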
Saving the scraped data
with lock:
    df = pd.DataFrame([data])
    # Append below the last used row of 'Sheet1'; the lock keeps concurrent writers from clobbering the file
    with pd.ExcelWriter(output_excel, mode='a', if_sheet_exists='overlay', engine='openpyxl') as writer:
        df.to_excel(writer, index=False, header=False, startrow=writer.sheets['Sheet1'].max_row)
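Note that mode='a' with if_sheet_exists='overlay' only works if the workbook and 'Sheet1' already exist, which is why the main function in the complete code below writes a header-only file up front:

# Done once before any worker starts; headers matches the dict keys used in load_html
pd.DataFrame(columns=headers).to_excel(output_excel, index=False, engine='openpyxl')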
Complete code
import time
import zipfile
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from lxml import etree
from threading import Lock
def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password, scheme='http', plugin_path='Selenium-Chrome-HTTP-Private-Proxy.zip'):
    """Build a Chrome extension (zip) that sets a fixed proxy and answers its auth challenge."""
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version": "22.0.0"
    }
    """
    background_js = f"""
    var config = {{
        mode: "fixed_servers",
        rules: {{
            singleProxy: {{
                scheme: "{scheme}",
                host: "{proxy_host}",
                port: parseInt("{proxy_port}")
            }},
            bypassList: ["foobar.com"]
        }}
    }};
    chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
    function callbackFn(details) {{
        return {{
            authCredentials: {{
                username: "{proxy_username}",
                password: "{proxy_password}"
            }}
        }};
    }}
    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {{urls: ["<all_urls>"]}},
        ['blocking']
    );
    """
    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)
    return plugin_path
def configure_headless_browser(proxy_config):
    """Configure and return a Chrome browser that routes traffic through the given proxy."""
    chrome_options = Options()
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    prefs = {"profile.managed_default_content_settings.images": 2}  # disable image loading to speed up page loads
    chrome_options.add_experimental_option("prefs", prefs)
    # Build the proxy-auth extension. Note that legacy headless Chrome cannot load
    # extensions; pass "--headless=new" (Chrome 109+) if headless operation is needed.
    proxyauth_plugin_path = create_proxyauth_extension(
        proxy_host=proxy_config[0],
        proxy_port=proxy_config[1],
        proxy_username=proxy_config[2],
        proxy_password=proxy_config[3]
    )
    chrome_options.add_extension(proxyauth_plugin_path)
    chrome_service = Service("./chromedriver.exe")
    return webdriver.Chrome(service=chrome_service, options=chrome_options)
def read_urls_from_excel(excel_file, sheet_name='Sheet1', url_column='URL'):
    """Read the list of URLs from an Excel file."""
    df = pd.read_excel(excel_file, sheet_name=sheet_name)
    return df[url_column].tolist()
def load_html(url, proxy_config, proxy_lock, api_url, proxy_config_list):
    """Load the URL's HTML and extract the required fields."""
    driver = configure_headless_browser(proxy_config)
    try:
        driver.get(url)
        time.sleep(15)  # wait 15 seconds for the page to finish rendering
        page_source = driver.page_source
        # Detect either a 403 page or the site's anti-bot verification prompt
        if "403" in page_source or "当前 IP 地址可能存在异常访问行为,完成验证后即可正常使用" in page_source:
            print(f"{url} returned 403, rotating the proxy...")
            update_proxy_config(api_url, proxy_config_list, proxy_lock)
            return load_html(url, proxy_config_list[0], proxy_lock, api_url, proxy_config_list)  # retry with the refreshed proxy
        # Parse the rendered HTML with lxml
        tree = etree.HTML(page_source)
        print(tree.xpath('//h1[@class="name"]/text()'))
        data = {
            '企业名称': tree.xpath('//h1[@class="name"]/text()'),
            '公司名称': tree.xpath("//ul/li[@class='business-detail-name']/text()"),
            '法定代表人': tree.xpath("//ul/li[@class='business-detail-user']/text()"),
            '成立时间': tree.xpath("//ul/li[@class='business-detail-time']/text()"),
            '公司类型': tree.xpath("//ul/li[@class='business-detail-type']/text()"),
            '公司状态': tree.xpath("//ul/li[@class='business-detail-status']/text()"),
            '注册资本': tree.xpath("//ul/li[@class='business-detail-money']/text()"),
            '地址': tree.xpath("//ul/li[@class='business-detail-location']/text()"),
            '期限': tree.xpath("//ul/li[@class='business-detail-business-time w-210px']/text()"),
            '所属地区': tree.xpath("//ul/li[@class='business-detail-belone-location w-150px']/text()"),
            '统一社会信用代码': tree.xpath("//ul/li[@class='business-detail-id']/text()"),
            '审核时间': tree.xpath("//ul/li[@class='business-detail-check-time w-210px']/text()"),
            '主管机关': tree.xpath("//ul/li[@class='business-detail-orang col-auto']/text()")
        }
        return data
    except Exception as e:
        print(f"Error while loading {url}: {e}")
        return None
    finally:
        driver.quit()
def update_proxy_config(api_url, proxy_config_list, lock):
    """Fetch fresh proxies from the API when a 403 is hit."""
    with lock:
        try:
            response = requests.get(api_url)
            response.raise_for_status()
            proxy_data = response.text.strip().split("\n")
            # Only consume as many lines as there are pool slots, to avoid an IndexError
            for i, proxy in enumerate(proxy_data[:len(proxy_config_list)]):
                proxy_ip, proxy_port = proxy.split(":")
                # Replace the entry in place so every worker sees the refreshed proxy
                proxy_config_list[i][:] = [proxy_ip, proxy_port, "d2417743882", "lqaw70bk"]
            print(f"Updated proxy pool: {proxy_config_list}")
        except Exception as e:
            print(f"Failed to update proxies: {e}")
def save_to_excel(data, output_excel, lock):
    """Append one row of data to the Excel file."""
    with lock:
        df = pd.DataFrame([data])
        # Append below the last used row of 'Sheet1'; the lock keeps concurrent writers from clobbering the file
        with pd.ExcelWriter(output_excel, mode='a', if_sheet_exists='overlay', engine='openpyxl') as writer:
            df.to_excel(writer, index=False, header=False, startrow=writer.sheets['Sheet1'].max_row)
def main(input_excel, output_excel, proxy_config_list, api_url):
    """Main entry point: read the URLs, scrape each page, and save the data to Excel."""
    urls = read_urls_from_excel(input_excel)
    # Create the output workbook with just the header row
    headers = ['企业名称', '公司名称', '法定代表人', '成立时间', '公司类型', '公司状态', '注册资本', '地址', '期限', '所属地区', '统一社会信用代码', '审核时间', '主管机关']
    pd.DataFrame(columns=headers).to_excel(output_excel, index=False, engine='openpyxl')
    lock = Lock()        # serializes writes to the output workbook
    proxy_lock = Lock()  # shared by all workers so the proxy pool is refreshed by one thread at a time
    try:
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for i, url in enumerate(urls):
                # Each block of five consecutive URLs shares one proxy; % wraps around the pool
                proxy_config = proxy_config_list[i // 5 % len(proxy_config_list)]
                futures.append(executor.submit(load_html_and_save, url, proxy_config, output_excel, lock, proxy_lock, api_url, proxy_config_list))
            for future in as_completed(futures):
                future.result()  # re-raises any exception from the worker
    except Exception as e:
        print(f"Error during execution: {e}")
    print(f'Scraping finished; data saved to: {output_excel}')
def load_html_and_save(url, proxy_config, output_excel, lock, proxy_lock, api_url, proxy_config_list):
    """Scrape one URL and, if anything was extracted, append it to the Excel file."""
    data = load_html(url, proxy_config, proxy_lock, api_url, proxy_config_list)
    if data:
        save_to_excel(data, output_excel, lock)
if __name__ == '__main__':
    input_excel_file = 'url.xlsx'
    output_excel_file = 'output_company_data.xlsx'
    proxy_config_list = [
        ["xxx", "xxx", "xxx", "xxx"],
        ["xxx", "xxx", "xxx", "xxx"]
    ]
    api_url = "https://xxxxxxxx.com"
    main(input_excel_file, output_excel_file, proxy_config_list, api_url)