这篇博客将详细说明如何从 Java 应用程序调用一个 Python 脚本,并在此过程中传递参数给一个 Scrapy 爬虫。最终目标是让 Java 控制爬虫的行为,如爬取数量和特定的运行参数。
一、Scrapy 爬虫的修改
首先,我们需要确保 Scrapy 爬虫能接收从命令行传递的参数。这涉及到修改 Scrapy 的启动文件和爬虫文件。
- 创建 Python 启动脚本 `run.py`:这个脚本负责解析从 Java 传来的命令行参数,并将其传递给 Scrapy 爬虫。
# run.py
"""Launcher script: parses command-line arguments passed in from Java and
forwards them to the Scrapy spider via `scrapy crawl ... -a key=value`."""
import json
import logging
import sys
from logging import handlers


class Logger(object):
    """Small logging helper that writes to both the console and a
    time-rotated file."""

    # Map human-readable level names to stdlib logging level constants.
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL,
    }

    def __init__(self, filename, level='info', when='D', backCount=3,
                 fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
        """
        :param filename: path of the rotating log file
        :param level: one of the keys of ``level_relations``
        :param when: rotation interval unit ('D' = daily)
        :param backCount: number of rotated backup files to keep
        :param fmt: log record format string
        """
        self.logger = logging.getLogger(filename)
        format_str = logging.Formatter(fmt)
        self.logger.setLevel(self.level_relations.get(level))
        # Console handler.
        sh = logging.StreamHandler()
        sh.setFormatter(format_str)
        # Time-rotated file handler, keeping `backCount` backups.
        th = handlers.TimedRotatingFileHandler(filename=filename, when=when,
                                               backupCount=backCount,
                                               encoding='utf-8')
        th.setFormatter(format_str)
        self.logger.addHandler(sh)
        self.logger.addHandler(th)


log_all_path = './log_info.log'
log = Logger(log_all_path, level='debug')

if __name__ == '__main__':
    # scrapy is only needed when actually launching a crawl, so import it
    # here rather than at module level.
    from scrapy import cmdline

    # Arguments passed in from the outside (Java):
    # spider_name = sys.argv[1]              # spider name
    # token = sys.argv[2].replace("?", " ")  # token (spaces are sent as '?')
    # num = sys.argv[3]                      # numeric argument
    # map_json = sys.argv[4]                 # JSON-encoded dict
    # Test data:
    spider_name = 'pic'
    token = "123"
    num = 100
    map_json = '{"id": 1, "url": "https://pic.netbian.com/tupian/34225.html"}'

    # Log the received values.
    log.logger.info(f"Received token: {token}")
    log.logger.info(f"Received map_json: {map_json}")

    try:
        # Validate up front that map_json really is well-formed JSON.
        map_data = json.loads(map_json)
    except json.JSONDecodeError as e:
        log.logger.error(f"JSON decoding error: {e}, map_json: {map_json}")
        sys.exit(1)

    # Build the Scrapy command, forwarding each value as a spider argument.
    command = [
        "scrapy", "crawl", spider_name,
        "-a", f"token={token}",
        "-a", f"num={num}",
        "-a", f"map={map_json}"
    ]
    cmdline.execute(command)
- 修改 Scrapy 爬虫,使其接收并使用这些参数:
# pic_spider.py
import scrapy
import json


class PicSpider(scrapy.Spider):
    """
    Image spider for pic.netbian.com (彼岸图网).
    https://pic.netbian.com/
    """
    name = 'pic'
    # allowed_domains = ['netbian.com']
    start_urls = ['https://pic.netbian.com//']

    def __init__(self, token=None, num=None, map_json=None, *args, **kwargs):
        """
        :param token: token string forwarded from run.py via ``-a token=...``
        :param num: crawl count; defaults to 100 when not supplied
        :param map_json: JSON-encoded dict with crawl parameters
        """
        # run.py forwards the dict as "-a map=...", so Scrapy delivers the
        # keyword `map`, not `map_json` — accept it as a fallback, otherwise
        # the JSON silently disappears into **kwargs and self.map_json
        # stays empty.
        if map_json is None:
            map_json = kwargs.pop('map', None)
        super(PicSpider, self).__init__(*args, **kwargs)
        self.token = token
        self.num = int(num) if num else 100  # numeric arg with default
        self.map_json = json.loads(map_json) if map_json else {}

    def parse(self, resp, **kwargs):
        # Target URL comes from the dict passed in by Java.
        url = self.map_json.get('url')
        print(resp.text)
二、Java 端的实现
在 Java 端,需要构建命令行参数并通过 `Runtime.exec()`(或更稳妥的 `ProcessBuilder`)来调用 Python 脚本。
import com.google.gson.Gson;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
public class RunScrapyFromJava {
public static void main(String[] args) {
try {
String pythonScriptPath = "path/to/run.py";
String spiderName = "pic";
String token = "some_token";
int num = 100;
Map<String, Object> map = new HashMap<>();
map.put("key1", "value1");
// 将标题中的空格替换成问号, 如果用空格传输会导致python解析失败
map.put("key2", "value2".replace(" ","?"));
String jsonMap = new Gson().toJson(map);
// 对 JSON 字符串进行转义处理,不然python会过滤掉导致转map时失败
json = json.replace("\"", "\\\"");
if (jsonMap.isEmpty()) {
jsonMap = "{}"; // 确保总是传递一个 JSON 字符串
}
// 构建命令
String command = String.format("python %s %s %s %d '%s'",
pythonScriptPath, spiderName, token, num, jsonMap);
Process p = Runtime.getRuntime().exec(command);
p.waitFor();
BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream()));
String line;
while ((line = reader.readLine()) != null) {
System.out.println(line);
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
总结
这个过程实现了从 Java 端通过 Python 启动脚本向 Scrapy 爬虫动态传递参数,使得爬虫行为可以根据外部输入进行调整。此方法适用于需要从 Java 控制 Python 爬虫的场景,如企业应用中的数据采集任务。