go get xxx timeout

This post explains how to resolve a problem encountered while installing the Go scraping package PuerkitoBio/goquery. When go get times out because of network connectivity issues (golang.org being unreachable), you can work around it by manually cloning the required dependency from its GitHub mirror into the local GOPATH directory.

Problem Description

I wanted to build a small crawler project in Go, but installing the package PuerkitoBio/goquery timed out because golang.org is blocked:

$ go get github.com/PuerkitoBio/goquery
package golang.org/x/net/html: unrecognized import path "golang.org/x/net/html" (https fetch: Get https://golang.org/x/net/html?go-get=1: dial tcp 216.239.37.1:443: i/o timeout)

Solution

1. Install golang.org/x/net

Third-party Go packages are installed under the GOPATH directory, so we can clone the GitHub mirror of golang.org/x/net into the corresponding path by hand:

$ mkdir -p $GOPATH/src/golang.org/x/
$ git clone https://github.com/golang/net.git $GOPATH/src/golang.org/x/net
$ go install golang.org/x/net/html

2. Install PuerkitoBio/goquery

$ go get github.com/PuerkitoBio/goquery
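
If both steps complete without errors, goquery and its golang.org/x/net/html dependency are now resolved from the local clone. As a quick sanity check, here is a minimal sketch of a goquery program (the target URL https://example.com is only a placeholder) that fetches a page and prints the text and href of every <a> tag:

package main

import (
	"fmt"
	"log"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// NewDocument performs the HTTP GET and parses the response HTML.
	doc, err := goquery.NewDocument("https://example.com")
	if err != nil {
		log.Fatal(err)
	}

	// Print the text and href of every <a> tag on the page.
	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		fmt.Printf("%d: %s -> %s\n", i, s.Text(), href)
	})
}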