Java Web Crawler crawler4j Study Notes <21>: The Page Class

Introduction

The Page class holds the data for a fetched page. It loads the HttpEntity returned by the Apache HttpClient library and exposes the current page's information: its URL (converted to a WebURL), response metadata (status code, response headers, etc.), and the content data later populated by the parsers.
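
As a rough illustration of the fetch-and-load flow (crawler4j's PageFetcher does something similar internally; the HttpClient setup below is only a sketch, and the example URL is a placeholder):

```java
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.url.WebURL;

public class PageLoadSketch {
  public static void main(String[] args) throws Exception {
    WebURL url = new WebURL();
    url.setURL("http://www.example.com/"); // placeholder URL

    Page page = new Page(url);

    // Fetch with HttpClient 4.x (DefaultHttpClient matches the era of this source)
    HttpResponse response = new DefaultHttpClient().execute(new HttpGet(url.getURL()));
    page.setStatusCode(response.getStatusLine().getStatusCode());
    page.setFetchResponseHeaders(response.getAllHeaders());

    // load() fills in contentType, contentEncoding, contentCharset and contentData
    HttpEntity entity = response.getEntity();
    if (entity != null) {
      page.load(entity);
    }

    System.out.println("Status: " + page.getStatusCode());
    System.out.println("Content-Type: " + page.getContentType());
    System.out.println("Charset: " + page.getContentCharset());
    System.out.println("Content bytes: " + page.getContentData().length);
  }
}
```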

Source Code

package edu.uci.ics.crawler4j.crawler;

import java.nio.charset.Charset;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.util.EntityUtils;

import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * This class contains the data for a fetched and parsed page.
 *
 * @author Yasser Ganjisaffar [lastname at gmail dot com]
 */
// Describes a fetched and parsed web page
public class Page {

  /**
   * The URL of this page.
   */
  // URL of the current page
  protected WebURL url;

  /**
   * Redirection flag
   */
  // Whether the current page is a redirect
  protected boolean redirect;

  /**
   * The URL to which this page will be redirected
   */
  // The URL this page redirects to
  protected String redirectedToUrl;

  /**
   * Status of the page
   */
  // HTTP status code of the current page
  protected int statusCode;

  /**
   * The content of this page in binary format.
   */
  // Page content in binary form
  protected byte[] contentData;

  /**
   * The ContentType of this page.
   * For example: "text/html; charset=UTF-8"
   */
  // ContentType of the current page
  protected String contentType;

  /**
   * The encoding of the content.
   * For example: "gzip"
   */
  // Content encoding of the current page
  protected String contentEncoding;

  /**
   * The charset of the content.
   * For example: "UTF-8"
   */
  // Charset of the page content
  protected String contentCharset;

  /**
   * Language of the Content.
   */
  // Language of the page content
  private String language;

  /**
   * Headers which were present in the response of the fetch request
   */
  // Response headers returned when fetching the current page
  protected Header[] fetchResponseHeaders;

  /**
   * The parsed data populated by parsers
   */
  // Data populated by the parsers for this page
  protected ParseData parseData;


  public Page(WebURL url) {
    this.url = url;
  }

  /**
   * Loads the content of this page from a fetched HttpEntity.
   *
   * @param entity HttpEntity
   * @throws Exception when load fails
   */
  // Parses the entity received via the HttpClient library
  public void load(HttpEntity entity) throws Exception {

    contentType = null;
    Header type = entity.getContentType();
    if (type != null) {
      contentType = type.getValue();
    }

    contentEncoding = null;
    Header encoding = entity.getContentEncoding();
    if (encoding != null) {
      contentEncoding = encoding.getValue();
    }

    Charset charset = ContentType.getOrDefault(entity).getCharset();
    if (charset != null) {
      contentCharset = charset.displayName();
    }

    contentData = EntityUtils.toByteArray(entity);
  }

  public WebURL getWebURL() {
    return url;
  }

  public void setWebURL(WebURL url) {
    this.url = url;
  }

  public boolean isRedirect() {
    return redirect;
  }

  public void setRedirect(boolean redirect) {
    this.redirect = redirect;
  }

  public String getRedirectedToUrl() {
    return redirectedToUrl;
  }

  public void setRedirectedToUrl(String redirectedToUrl) {
    this.redirectedToUrl = redirectedToUrl;
  }

  public int getStatusCode() {
    return statusCode;
  }

  public void setStatusCode(int statusCode) {
    this.statusCode = statusCode;
  }

  /**
   * Returns headers which were present in the response of the fetch request
   *
   * @return Header Array, the response headers
   */
  public Header[] getFetchResponseHeaders() {
    return fetchResponseHeaders;
  }

  public void setFetchResponseHeaders(Header[] headers) {
    fetchResponseHeaders = headers;
  }

  /**
   * @return parsed data generated for this page by parsers
   */
  public ParseData getParseData() {
    return parseData;
  }

  public void setParseData(ParseData parseData) {
    this.parseData = parseData;
  }

  /**
   * @return content of this page in binary format.
   */
  public byte[] getContentData() {
    return contentData;
  }

  public void setContentData(byte[] contentData) {
    this.contentData = contentData;
  }

  /**
   * @return ContentType of this page.
   * For example: "text/html; charset=UTF-8"
   */
  public String getContentType() {
    return contentType;
  }

  public void setContentType(String contentType) {
    this.contentType = contentType;
  }

  /**
   * @return encoding of the content.
   * For example: "gzip"
   */
  public String getContentEncoding() {
    return contentEncoding;
  }

  public void setContentEncoding(String contentEncoding) {
    this.contentEncoding = contentEncoding;
  }

  /**
   * @return charset of the content.
   * For example: "UTF-8"
   */
  public String getContentCharset() {
    return contentCharset;
  }

  public void setContentCharset(String contentCharset) {
    this.contentCharset = contentCharset;
  }

  /**
   * @return Language
   */
  public String getLanguage() {
    return language;
  }

  public void setLanguage(String language) {
    this.language = language;
  }
}
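
For reference, here is how downstream code typically consumes a Page. The visit() callback is the standard crawler4j extension point; MyCrawler is a placeholder class name, and the HtmlParseData check assumes the fetched page was HTML:

```java
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;

public class MyCrawler extends WebCrawler {

  @Override
  public void visit(Page page) {
    // Metadata populated during fetching
    System.out.println("URL: " + page.getWebURL().getURL());
    System.out.println("Status: " + page.getStatusCode());
    System.out.println("Content-Type: " + page.getContentType());

    // parseData is filled in by the parser after load(); for HTML pages it is an HtmlParseData
    if (page.getParseData() instanceof HtmlParseData) {
      HtmlParseData html = (HtmlParseData) page.getParseData();
      System.out.println("Title: " + html.getTitle());
      System.out.println("Text length: " + html.getText().length());
      System.out.println("Outgoing links: " + html.getOutgoingUrls().size());
    }
  }
}
```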
