Java---Jsoup或正则-抽取网页信息

利用Jsoup、正则抽取网页的标题、作者、发布时间以及正文
源网站
利用Jsoup进行抽取

/**
	 * Demo entry point: downloads one article page and prints its title,
	 * publication time, author and first non-empty paragraph.
	 *
	 * @param args unused
	 */
public static void main(String[] args) {
		String url = "https://auto.gasgoo.com/a/70125507.html";
		String sourceStr = getHTML(url, "UTF-8");
		System.out.println(getTitle(sourceStr));
		System.out.println(getTime(sourceStr));
		System.out.println(getAuthor(sourceStr));
		// The paragraph list can be empty (e.g. the download failed), so do not
		// index into it blindly; print an empty line instead of crashing.
		List<String> paragraphs = getContent(sourceStr);
		System.out.println(paragraphs.isEmpty() ? "" : paragraphs.get(0));
		
	}
	
	/**
	 * Extracts the publication time from the page source.
	 * <p>
	 * Two layouts are handled: when an element with class {@code userInfo}
	 * exists, the text of its first {@code span} is used; otherwise the text of
	 * the second {@code span} inside the first {@code pageInfo} element is used.
	 *
	 * @param sourceStr HTML source of the page
	 * @return the text of the matching span, or an empty string when the
	 *         expected elements are not present
	 */
	public static String getTime(String sourceStr) {
		Document doc = Jsoup.parse(sourceStr);
		// Class lookups return every element carrying the class; only the first one is used.
		Elements userInfo = doc.getElementsByClass("userInfo");
		if (!userInfo.isEmpty()) {
			Elements userSpans = userInfo.get(0).getElementsByTag("span");
			return userSpans.isEmpty() ? "" : userSpans.get(0).text();
		}

		Elements pageInfo = doc.getElementsByClass("pageInfo");
		if (pageInfo.isEmpty()) {
			return "";
		}
		Elements pageSpans = pageInfo.get(0).getElementsByTag("span");
		// In this layout the second span is the one that is read.
		return pageSpans.size() > 1 ? pageSpans.get(1).text() : "";
	}
	
	/**
	 * Extracts the author from the page source. The site uses several markup
	 * variants for the author, so each one is handled separately.
	 * <p>
	 * When an element with class {@code userInfo} exists, its links are
	 * inspected: with exactly two links the second one is used, otherwise the
	 * first. Without {@code userInfo}, the first link inside the first
	 * {@code scribe} element is used.
	 *
	 * @param sourceStr HTML source of the page
	 * @return the link text holding the author, or an empty string when the
	 *         expected elements are not present
	 */
	public static String getAuthor(String sourceStr) {
		Document doc = Jsoup.parse(sourceStr);

		Elements userInfo = doc.getElementsByClass("userInfo");
		if (!userInfo.isEmpty()) {
			Elements links = userInfo.get(0).getElementsByTag("a");
			if (links.isEmpty()) {
				return "";
			}
			return links.size() == 2 ? links.get(1).text() : links.get(0).text();
		}

		// The "scribe" element is only consulted for pages without "userInfo";
		// looking it up unconditionally would fail on pages that lack it.
		Elements scribe = doc.getElementsByClass("scribe");
		if (scribe.isEmpty()) {
			return "";
		}
		Elements links = scribe.get(0).getElementsByTag("a");
		return links.isEmpty() ? "" : links.get(0).text();
	}
	
	/**
	 * Extracts the article title from the page source.
	 * <p>
	 * The title is the text of the first {@code h1} inside the first element
	 * with class {@code detailed}; when no such element exists, the first
	 * element carrying both classes {@code articleNR} and {@code fl} is used
	 * instead.
	 *
	 * @param sourceStr HTML source of the page
	 * @return the heading text, or an empty string when the expected elements
	 *         are not present
	 */
	public static String getTitle(String sourceStr) {
		Document doc = Jsoup.parse(sourceStr);

		Elements containers = doc.getElementsByClass("detailed");
		if (containers.isEmpty()) {
			// A class lookup takes a single class name, so the two-class
			// container is matched with a CSS selector instead of "articleNR fl".
			containers = doc.select(".articleNR.fl");
		}
		if (containers.isEmpty()) {
			return "";
		}

		Elements headings = containers.get(0).getElementsByTag("h1");
		return headings.isEmpty() ? "" : headings.get(0).text();
	}
	
	/**
	 * Extracts the article body as a list of paragraphs.
	 * <p>
	 * The body is the element with id {@code ArticleContent}; the text of each
	 * {@code p} element inside it is collected, skipping paragraphs whose text
	 * is empty.
	 *
	 * @param sourceStr HTML source of the page
	 * @return the non-empty paragraph texts in document order; an empty list
	 *         when the body element is not present
	 */
	public static List<String> getContent(String sourceStr) {
		List<String> paragraphs = new ArrayList<>();
		Document document = Jsoup.parse(sourceStr);
		// The body block is identified by its id.
		Element content = document.getElementById("ArticleContent");
		if (content == null) {
			// No body element on this page: nothing to extract.
			return paragraphs;
		}
		for (Element paragraph : content.getElementsByTag("p")) {
			String text = paragraph.text();
			if (!text.isEmpty()) {
				paragraphs.add(text);
			}
		}
		return paragraphs;
	}
	/**
	 * Downloads the page at the given URL and returns its source as text.
	 * <p>
	 * This is best-effort: any failure (for example a malformed URL or an I/O
	 * error) has its stack trace printed and whatever was read up to that point
	 * is returned, which may be the empty string.
	 *
	 * @param pageURL address of the page to download
	 * @param charset name of the charset used to decode the response
	 * @return the page source, each line terminated by {@code "\r\n"}
	 */
	public static String getHTML(String pageURL, String charset) {
		StringBuilder pageHTML = new StringBuilder();
		HttpURLConnection connection = null;
		try {
			URL url = new URL(pageURL);
			connection = (HttpURLConnection) url.openConnection();
			connection.setRequestProperty("User-Agent",
					"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36");
			// try-with-resources closes the reader (and the response stream it
			// wraps) even when reading fails part-way through.
			try (BufferedReader br = new BufferedReader(
					new InputStreamReader(connection.getInputStream(), charset))) {
				String line;
				while ((line = br.readLine()) != null) {
					pageHTML.append(line);
					pageHTML.append("\r\n");
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Release the connection on the failure path as well.
			if (connection != null) {
				connection.disconnect();
			}
		}
		return pageHTML.toString();
	}

利用正则表达式进行抽取

/**
	 * Extracts the article id from a page URL: the last path segment without
	 * its {@code .shtml} or {@code .html} extension.
	 * <p>
	 * NOTE(review): relies on {@code RegexTools.getRegexResult} yielding the
	 * captured group and an empty string when nothing matches - confirm
	 * against that helper.
	 *
	 * @param url page URL
	 * @return the {@code .shtml} result when it is non-empty, otherwise the
	 *         {@code .html} result
	 */
	public String getId(String url) {
		// The dot before the extension is escaped so that it only matches a
		// literal '.', not an arbitrary character.
		String shtmlId = RegexTools.getRegexResult(url, ".+\\/(.*?)\\.shtml");
		String htmlId = RegexTools.getRegexResult(url, ".+\\/(.*?)\\.html");

		if (!shtmlId.equals("")) {
			return shtmlId;
		}
		return htmlId;
	}
	
	/**
	 * Extracts the article title from the page source with regular
	 * expressions. Two page layouts are tried: a {@code detailed} container
	 * and an {@code articleNR fl} container.
	 * <p>
	 * NOTE(review): how the patterns are applied (flags, find vs. full match)
	 * depends on {@code RegexTools.getRegexResult}, which is not shown here.
	 *
	 * @param sourceStr HTML source of the page
	 * @return the first layout's result unless it is the empty string, in
	 *         which case the second layout's result
	 */
	public String getTitle(String sourceStr) {
		final String fromDetailed =
				RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"detailed\">.*<h1>(.*?)</h1>");
		final String fromArticle =
				RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"articleNR\\sfl\">\\s*<h1>(.*?)</h1>");

		// The "detailed" layout wins whenever it produced anything.
		if (fromDetailed.equals("")) {
			return fromArticle;
		}
		return fromDetailed;
	}
	
	/**
	 * Extracts the publication time from the page source with regular
	 * expressions. Two page layouts are tried: a span inside the
	 * {@code userInfo} container, and a span placed directly before the
	 * {@code scribe} span.
	 *
	 * @param sourceStr HTML source of the page
	 * @return the first layout's result unless it is the empty string, in
	 *         which case the second layout's result
	 */
	public String getTime(String sourceStr) {
		final String fromUserInfo =
				RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"userInfo\">.*?<span>(.*?)</span>.*?</div>");
		final String beforeScribe =
				RegexTools.getRegexResult(sourceStr, "</a>\\s*</span>\\s*<span>(.*?)</span>\\s*<span\\sclass=\"scribe\">");

		if (fromUserInfo.equals("")) {
			return beforeScribe;
		}
		return fromUserInfo;
	}
	
	
	/**
	 * Extracts the author from the page source with regular expressions. The
	 * site uses several markup variants for the author, so four patterns are
	 * evaluated and the first non-empty result, in priority order, is
	 * returned.
	 *
	 * @param sourceStr HTML source of the page
	 * @return the highest-priority non-empty result; the result of the last
	 *         pattern (possibly empty) when the first three are all empty
	 */
	public String getAuthor(String sourceStr) {
		// Candidates in priority order: editor link, any link, then plain
		// text, all inside the "userInfo" container.
		final String[] preferred = {
			RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"userInfo\">.*?<a.*?class=\"editor\">(.*?)</a>.*?</div>"),
			RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"userInfo\">.*?<a.*?>(.*?)</a>.*?</div>"),
			RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"userInfo\">\\s+(.*?)\\s+<span>")
		};
		// Last resort: the link text following the image inside the "scribe" span.
		final String fromScribe =
				RegexTools.getRegexResult(sourceStr, "<span\\sclass=\"scribe\">\\s*<a\\shref=.*?>\\s*<img\\s.*?>(.*?)\\s*</a>");

		for (String candidate : preferred) {
			if (!candidate.equals("")) {
				return candidate;
			}
		}
		return fromScribe;
	}
	
	/**
	 * Extracts the article body as a list of paragraphs.
	 * <p>
	 * The body is the element with id {@code ArticleContent}; the text of each
	 * {@code p} element inside it is collected, skipping paragraphs whose text
	 * is empty.
	 *
	 * @param sourceStr HTML source of the page
	 * @return the non-empty paragraph texts in document order; an empty list
	 *         when the body element is not present
	 */
	public List<String> getContent(String sourceStr) {
		List<String> paragraphs = new ArrayList<>();
		Document document = Jsoup.parse(sourceStr);
		// The body block is identified by its id.
		Element content = document.getElementById("ArticleContent");
		if (content == null) {
			// No body element on this page: nothing to extract.
			return paragraphs;
		}
		for (Element paragraph : content.getElementsByTag("p")) {
			String text = paragraph.text();
			if (!text.isEmpty()) {
				paragraphs.add(text);
			}
		}
		return paragraphs;
	}

完!

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值