webmagic 文档地址: http://webmagic.io/docs/zh/ webmagic github 地址: https://github.com/code4craft/webmagic
注意:目前使用过 0.6.1 版本 和 0.7.x 版本
pom依赖如下:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.6.1</version>
</dependency>
- 模拟登录 其中 0.6.1 版本对于 post 方法的支持不太好,如果需要模拟登录,可以先采用 httpclient 登录,登录后使用 cookieStore 获取 cookie,再通过 webmagic 的 site 设置。 0.7.x 版本好像对这种设置 cookie 的方式不太兼容,不确定是使用方式不对,还是该版本的 bug。 示例代码:
public class SpiderLogin{
public void login(url) {
HttpGet httpGet = new HttpGet(url);
CookieStore cookieStore = new BasicCookieStore();
CloseableHttpClient httpClient = HttpClients.createDefault();
CloseableHttpResponse response = httpClient.execute(httpGet);
String result = EntityUtils.toString(response.getEntity(), "utf-8");
if(...){
// 登录成功的逻辑
List<Cookie> cookies = cookiestore.getCookies(); // 从cookiesstore 中获取cookie
this.saveCookie(cookies); // 遍历cookie保存到cookieMap中
}
}
public void saveCookie(List<Cookie> cookies) {
for (int i = 0; i < cookies.size(); i++) {
if (cookies.get(i).getName().equals("PHPSESSID")) {
cookieMap.put("PHPSESSID", cookies.get(i).getValue());
}
}
}
}
Spider 示例代码:
/**
 * PageProcessor that reuses the session cookies captured by SpiderLogin,
 * so authenticated pages can be crawled.
 */
public class JDParser implements PageProcessor {

    // Crawl configuration; declared here (the original referenced an undeclared 'site').
    private Site site = Site.me();

    /**
     * Builds the Site for this crawl, copying every cookie captured at login.
     *
     * @return the configured Site (cookies + User-Agent header)
     */
    public Site getSite() {
        // Typed enhanced-for instead of a raw Iterator with casts and toString().
        for (Map.Entry<String, String> entry : SpiderLogin.cookieMap.entrySet()) {
            site.addCookie(entry.getKey(), entry.getValue());
        }
        return site.addHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1");
    }

    public void process(Page page) {
        // Page-handling logic goes here.
    }
}
- 设置代理 0.6.x 版本对代理设置的兼容性不太好,没有用成功。github 上有人提到(https://github.com/code4craft/webmagic/pull/290)0.7.0 是稳定版本,但更改版本并按照文档编写后,依然无法使用代理,所以只能重写 Downloader。
代码如下:
/**
 * Custom Downloader that routes every request through an authenticated HTTP proxy.
 * Written because webmagic 0.6.x/0.7.x built-in proxy support did not work
 * reliably (see https://github.com/code4craft/webmagic/pull/290).
 */
public class ProxyDownloader implements Downloader, Closeable {

    private static Logger logger = Logger.getLogger(ProxyDownloader.class);

    // TODO: replace these placeholders with your real proxy endpoint and credentials
    // (the original referenced host/port/username/password without declaring them).
    private static String host = "127.0.0.1";
    private static int port = 8080;
    private static String username = "";
    private static String password = "";

    // Kept as a field so close() can release the last client built by download().
    CloseableHttpClient httpClient;

    public ProxyDownloader() {
    }

    @Override
    public void close() throws IOException {
        // Guard: download() may never have been called.
        if (httpClient != null) {
            httpClient.close();
        }
    }

    /**
     * Downloads one page through the proxy (core entry point).
     *
     * @param request the request to fetch
     * @param task    owning task; supplies the Site configuration (may be null)
     * @return the Page on success, a cycle-retry Page on recoverable failure, else null
     */
    @Override
    public Page download(Request request, Task task) {
        Site site = null;
        if (task != null) {
            site = task.getSite();
        }
        Set<Integer> acceptStatCode;
        String charset = null;
        Map<String, String> headers = null;
        if (site != null) {
            acceptStatCode = site.getAcceptStatCode();
            charset = site.getCharset();
            headers = site.getHeaders();
        } else {
            acceptStatCode = WMCollections.newHashSet(200);
        }
        int statusCode = 0;
        logger.info("downloading page " + request.getUrl());
        httpClient = getHttpClientWithProxy(SpiderLogin.cookieStore);
        HttpGet httpGet = new HttpGet(request.getUrl());
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
                + " (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
        // Fixed: the Site headers were read but never applied to the request.
        if (headers != null) {
            for (Map.Entry<String, String> header : headers.entrySet()) {
                httpGet.setHeader(header.getKey(), header.getValue());
            }
        }
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            statusCode = response.getStatusLine().getStatusCode();
            request.putExtra(Request.STATUS_CODE, statusCode);
            if (statusAccept(acceptStatCode, statusCode)) {
                // handleResponse fully reads the entity, so closing the
                // response afterwards (in finally) is safe.
                Page page = handleResponse(request, charset, response, task);
                onSuccess(request);
                return page;
            } else {
                logger.warn("get page " + request.getUrl() + " error, status code " + statusCode);
                return null;
            }
        } catch (IOException e1) {
            logger.warn("download page " + request.getUrl() + " error " + e1.getMessage());
            // Fixed: NPE when task/site is null (the null-check above made site nullable).
            if (site != null && site.getCycleRetryTimes() > 0) {
                return addToCycleRetry(request, site);
            }
            onError(request);
            logger.info(e1.getMessage());
        } finally {
            // Fixed: the response was never closed, leaking the underlying connection.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException ignored) {
                    // best-effort cleanup; nothing useful to do here
                }
            }
        }
        return null;
    }

    /**
     * Success callback, invoked after a page is downloaded and accepted.
     *
     * @param request the request that succeeded
     */
    protected void onSuccess(Request request) {
    }

    /**
     * Failure callback, invoked when the download fails and is not retried.
     *
     * @param request the request that failed
     */
    protected void onError(Request request) {
    }

    /**
     * Wraps an HTTP response into a webmagic Page.
     *
     * @param request      originating request
     * @param charset      charset from the Site config, or null to auto-detect
     * @param httpResponse raw HTTP response
     * @param task         owning task (unused here, kept for interface parity)
     * @return the populated Page
     * @throws IOException if the entity cannot be read
     */
    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task)
            throws IOException {
        String content = getContent(charset, httpResponse);
        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        return page;
    }

    /**
     * Decodes the response body, auto-detecting the charset when none is configured.
     *
     * @param charset      configured charset, or null
     * @param httpResponse raw HTTP response
     * @return the decoded body
     * @throws IOException if the entity cannot be read
     */
    protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
        if (charset == null) {
            byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
            String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
            if (htmlCharset != null) {
                return new String(contentBytes, htmlCharset);
            } else {
                logger.warn("Charset autodetect failed, use " + Charset.defaultCharset()
                        + " as charset. Please specify charset in Site.setCharset()");
                return new String(contentBytes);
            }
        } else {
            return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
        }
    }

    /**
     * Detects the page charset: first from the Content-Type header, then from
     * the HTML meta tags.
     *
     * @param httpResponse raw HTTP response
     * @param contentBytes raw response body
     * @return the detected charset name, or null/blank when detection fails
     * @throws IOException if the body cannot be parsed
     */
    protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
        String charset = null;
        // 1. Content-Type header — fixed: getContentType() may be null (NPE before).
        if (httpResponse.getEntity().getContentType() != null) {
            String value = httpResponse.getEntity().getContentType().getValue();
            charset = UrlUtils.getCharset(value);
            if (StringUtils.isNotBlank(charset)) {
                logger.debug("Auto get charset:" + charset);
                return charset;
            }
        }
        // 2. Meta tags, decoded with the platform default charset as a best effort.
        Charset defaultCharset = Charset.defaultCharset();
        String content = new String(contentBytes, defaultCharset.name());
        if (StringUtils.isNotEmpty(content)) {
            Document document = Jsoup.parse(content);
            Elements links = document.select("meta");
            for (Element link : links) {
                // 2.1. HTML 4: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                String metaContent = link.attr("content");
                String metaCharset = link.attr("charset");
                if (metaContent.indexOf("charset") != -1) {
                    metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
                    charset = metaContent.split("=")[1];
                    break;
                }
                // 2.2. HTML 5: <meta charset="UTF-8" />
                else if (StringUtils.isNotEmpty(metaCharset)) {
                    charset = metaCharset;
                    break;
                }
            }
        }
        logger.debug("Auto get charset: " + charset);
        return charset;
    }

    /**
     * Cycle-retry support: re-enqueues the request until the Site's retry
     * limit is reached.
     *
     * @param request the failed request
     * @param site    site configuration supplying the retry limit
     * @return a Page carrying the retry request, or null once retries are exhausted
     */
    protected Page addToCycleRetry(Request request, Site site) {
        Page page = new Page();
        Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
        if (cycleTriedTimesObject == null) {
            page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
        } else {
            int cycleTriedTimes = (Integer) cycleTriedTimesObject;
            cycleTriedTimes++;
            if (cycleTriedTimes >= site.getCycleRetryTimes()) {
                return null; // retry budget exhausted
            }
            page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
        }
        page.setNeedCycleRetry(true);
        return page;
    }

    /**
     * Whether the status code is acceptable for this crawl.
     *
     * @param acceptStatCode accepted status codes
     * @param statusCode     actual status code
     * @return true if the code is accepted
     */
    protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
        return acceptStatCode.contains(statusCode);
    }

    @Override
    public void setThread(int threadNum) {
        // Single shared client; thread count is not used by this downloader.
    }

    /**
     * Builds an HttpClient that talks through the configured authenticated proxy.
     *
     * @param cookieStore session cookies to attach, or null
     * @return a configured client (caller is responsible for closing it)
     */
    public static CloseableHttpClient getHttpClientWithProxy(CookieStore cookieStore) {
        HttpClientBuilder builder = HttpClientBuilder.create();
        if (cookieStore != null) {
            builder.setDefaultCookieStore(cookieStore);
        }
        builder.setProxy(new HttpHost(host, port, "http"));
        BasicCredentialsProvider credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(username, password));
        builder.setDefaultCredentialsProvider(credsProvider);
        return builder.build();
    }
}
使用 Spider 时,通过 setDownloader 指定自定义下载器即可:
Spider.create(new JDParser()).addUrl(url).setDownloader(new ProxyDownloader()).run();
还有一点,0.6.x 版本在处理 相对路径的url时,会帮你把域名加上去,但是 0.7.x 没有加上去,不知道是没有提供这个方法还是我没有找到。