Java Web Crawler crawler4j Study Notes <21>: The Page Class

Introduction

The Page class holds the data for a fetched page. It loads the HttpEntity returned by the Apache HttpClient library and exposes the current page's information: its URL (converted to a WebURL), response metadata (status code, response headers, etc.), and the content data later populated by the parsers.
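
As a rough illustration of the fetch-and-load flow (crawler4j's PageFetcher does something similar internally; the HttpClient setup below is only a sketch, and the example URL is a placeholder):

```java
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.url.WebURL;

public class PageLoadSketch {
  public static void main(String[] args) throws Exception {
    WebURL url = new WebURL();
    url.setURL("http://www.example.com/"); // placeholder URL

    Page page = new Page(url);

    // Fetch with HttpClient 4.x (DefaultHttpClient matches the era of this source)
    HttpResponse response = new DefaultHttpClient().execute(new HttpGet(url.getURL()));
    page.setStatusCode(response.getStatusLine().getStatusCode());
    page.setFetchResponseHeaders(response.getAllHeaders());

    // load() fills in contentType, contentEncoding, contentCharset and contentData
    HttpEntity entity = response.getEntity();
    if (entity != null) {
      page.load(entity);
    }

    System.out.println("Status: " + page.getStatusCode());
    System.out.println("Content-Type: " + page.getContentType());
    System.out.println("Charset: " + page.getContentCharset());
    System.out.println("Content bytes: " + page.getContentData().length);
  }
}
```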

Source Code

package edu.uci.ics.crawler4j.crawler;

import java.nio.charset.Charset;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.util.EntityUtils;

import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * This class contains the data for a fetched and parsed page.
 *
 * @author Yasser Ganjisaffar [lastname at gmail dot com]
 */
// Describes a fetched and parsed web page
public class Page {

  /**
   * The URL of this page.
   */
  // URL of the current page
  protected WebURL url;

  /**
   * Redirection flag
   */
  // Whether the current page is a redirect
  protected boolean redirect;

  /**
   * The URL to which this page will be redirected
   */
  // The URL this page redirects to
  protected String redirectedToUrl;

  /**
   * Status of the page
   */
  // HTTP status code of the current page
  protected int statusCode;

  /**
   * The content of this page in binary format.
   */
  // Page content in binary form
  protected byte[] contentData;

  /**
   * The ContentType of this page.
   * For example: "text/html; charset=UTF-8"
   */
  // ContentType of the current page
  protected String contentType;

  /**
   * The encoding of the content.
   * For example: "gzip"
   */
  // Content encoding of the current page
  protected String contentEncoding;

  /**
   * The charset of the content.
   * For example: "UTF-8"
   */
  // Charset of the page content
  protected String contentCharset;

  /**
   * Language of the Content.
   */
  // Language of the page content
  private String language;

  /**
   * Headers which were present in the response of the fetch request
   */
  // Response headers returned when fetching the current page
  protected Header[] fetchResponseHeaders;

  /**
   * The parsed data populated by parsers
   */
  // Data populated by the parsers for this page
  protected ParseData parseData;


  public Page(WebURL url) {
    this.url = url;
  }

  /**
   * Loads the content of this page from a fetched HttpEntity.
   *
   * @param entity HttpEntity
   * @throws Exception when load fails
   */
  // Parses the entity received via the HttpClient library
  public void load(HttpEntity entity) throws Exception {

    contentType = null;
    Header type = entity.getContentType();
    if (type != null) {
      contentType = type.getValue();
    }

    contentEncoding = null;
    Header encoding = entity.getContentEncoding();
    if (encoding != null) {
      contentEncoding = encoding.getValue();
    }

    Charset charset = ContentType.getOrDefault(entity).getCharset();
    if (charset != null) {
      contentCharset = charset.displayName();
    }

    contentData = EntityUtils.toByteArray(entity);
  }

  public WebURL getWebURL() {
    return url;
  }

  public void setWebURL(WebURL url) {
    this.url = url;
  }

  public boolean isRedirect() {
    return redirect;
  }

  public void setRedirect(boolean redirect) {
    this.redirect = redirect;
  }

  public String getRedirectedToUrl() {
    return redirectedToUrl;
  }

  public void setRedirectedToUrl(String redirectedToUrl) {
    this.redirectedToUrl = redirectedToUrl;
  }

  public int getStatusCode() {
    return statusCode;
  }

  public void setStatusCode(int statusCode) {
    this.statusCode = statusCode;
  }

  /**
   * Returns headers which were present in the response of the fetch request
   *
   * @return Header Array, the response headers
   */
  public Header[] getFetchResponseHeaders() {
    return fetchResponseHeaders;
  }

  public void setFetchResponseHeaders(Header[] headers) {
    fetchResponseHeaders = headers;
  }

  /**
   * @return parsed data generated for this page by parsers
   */
  public ParseData getParseData() {
    return parseData;
  }

  public void setParseData(ParseData parseData) {
    this.parseData = parseData;
  }

  /**
   * @return content of this page in binary format.
   */
  public byte[] getContentData() {
    return contentData;
  }

  public void setContentData(byte[] contentData) {
    this.contentData = contentData;
  }

  /**
   * @return ContentType of this page.
   * For example: "text/html; charset=UTF-8"
   */
  public String getContentType() {
    return contentType;
  }

  public void setContentType(String contentType) {
    this.contentType = contentType;
  }

  /**
   * @return encoding of the content.
   * For example: "gzip"
   */
  public String getContentEncoding() {
    return contentEncoding;
  }

  public void setContentEncoding(String contentEncoding) {
    this.contentEncoding = contentEncoding;
  }

  /**
   * @return charset of the content.
   * For example: "UTF-8"
   */
  public String getContentCharset() {
    return contentCharset;
  }

  public void setContentCharset(String contentCharset) {
    this.contentCharset = contentCharset;
  }

  /**
   * @return Language
   */
  public String getLanguage() {
    return language;
  }

  public void setLanguage(String language) {
    this.language = language;
  }
}
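
For reference, here is how downstream code typically consumes a Page. The visit() callback is the standard crawler4j extension point; MyCrawler is a placeholder class name, and the HtmlParseData check assumes the fetched page was HTML:

```java
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;

public class MyCrawler extends WebCrawler {

  @Override
  public void visit(Page page) {
    // Metadata populated during fetching
    System.out.println("URL: " + page.getWebURL().getURL());
    System.out.println("Status: " + page.getStatusCode());
    System.out.println("Content-Type: " + page.getContentType());

    // parseData is filled in by the parser after load(); for HTML pages it is an HtmlParseData
    if (page.getParseData() instanceof HtmlParseData) {
      HtmlParseData html = (HtmlParseData) page.getParseData();
      System.out.println("Title: " + html.getTitle());
      System.out.println("Text length: " + html.getText().length());
      System.out.println("Outgoing links: " + html.getOutgoingUrls().size());
    }
  }
}
```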
