package org.baidu.crawl;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.Buffer;
import java.util.Iterator;
import javax.print.Doc;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Myreader {
//获取百度百科中的词条正文和词条便签
public static void CrawlBaidu(String url) throws IOException{
BufferedWriter bWriter = new BufferedWriter(new FileWriter("D:\\词条\\11"));
String regex = "\\[.\\d*\\]";
//String regex1 = "\\[.\\]\\s\\S";
Document document = Jsoup.connect(url).get();
Elements elementsTop = document.select("div.poster-top");
Elements elementsTopText = elementsTop.select("div.lemma-summary");
String stringText = elementsTopText.text();
String lastElementTopText = stringText.replaceAll(regex, "");
Elements elements = document.select("div.main-content");
if(elements.size()!=0){
Elements elements2 = elements.select("div.para");
if(elements2.size()!=0){
String strText = elements2.text();
String replace = strText.replaceAll(regex, "");
Document parse = Jsoup.parse(strText);
// String replaceAll = parse.body().text().replaceAll(regex, "");
Element elementById = document.getElementById("open-tag-item");
bWriter.write(lastElementTopText+replace+"\n"+"词条便签:" +elementById.text());
//当百度百科正文没有词条标签的时候,注释上面的write(),放开下面的的write()
bWriter.newLine();
bWriter.flush();
}
bWriter.close();
}
}
//获取正文中的超链接
public static void hyperLink(String url) throws IOException{
BufferedWriter bWriter = new BufferedWriter(new FileWriter("D:\\词条\\11链接"));
Document document = Jsoup.connect(url).get();
Elements elements = document.select("div.main-content");
if(elements.size()!=0){
Elements elements2 = elements.select("div.para");
if(elements2.size()!=0){
Elements elements3 = elements2.select("a[href]");
String text = elements3.html();
// System.out.println(text);
bWriter.write(text);
bWriter.newLine();
bWriter.flush();
}
}
bWriter.close();
}
//获取tbody要素
public static void crawlTbody(String url) throws IOException {
BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter("D:\\词条\\tbody"));
Document document = Jsoup.connect(url).get();
}
//一行一行的读取数据,并解决中文乱码问题,超链接字段与正文匹配,并把正文中超链接字段表示为[[超链接字段]]
public static void modifyHyperLink(String bodyText,String linkText,String url) throws IOException{
//输出最终路径
// BufferedWriter bWriter = new BufferedWriter(new FileWriter("D:\\词条\\last\\滇池"));
//百度百科词条的解释正文
FileInputStream fInputStreamBody = new FileInputStream(bodyText);
//设定输入字段的格式,防止中文乱码
InputStreamReader iStreamReaderBody = new InputStreamReader(fInputStreamBody,"UTF-8");
//缓冲字符输入流
BufferedReader bufferedReaderBody = new BufferedReader(iStreamReaderBody);
// //下面这一句和上面三句是同一个意思
// BufferedReader bReader = new BufferedReader(new InputStreamReader(new FileInputStream(bodyText), "UTF-8"));
String bodyLine = null;
bodyLine = bufferedReaderBody.readLine();
BufferedWriter bWriter = new BufferedWriter(new FileWriter("D:\\词条\\高校\\南开大学"));
//正文中的超链接
FileInputStream fInputStreamLink = new FileInputStream(linkText);
InputStreamReader iReaderLink = new InputStreamReader(fInputStreamLink);
BufferedReader bReaderLink = new BufferedReader(iReaderLink);
//一行一行读取超链接字段
String linkLine = null;
while((linkLine = bReaderLink.readLine())!=null){
bodyLine = bodyLine.replaceAll(linkLine, "[[" + linkLine + "]]");
}
//获取词条标签
Document document = Jsoup.connect(url).get();
Element element = document.getElementById("open-tag-item");
String fieldTag = element.text();
bWriter.write(bodyLine+ "\n" +"词条标签:" + fieldTag);
// bWriter.write(bodyLine);
bWriter.newLine();
bWriter.flush();
bWriter.close();
// System.out.println(bodyLine);
}
public static void main(String[] args) throws IOException {
String url = "http://baike.baidu.com/item/%E5%8D%97%E5%BC%80%E5%A4%A7%E5%AD%A6/134521";
CrawlBaidu(url);
hyperLink(url);
String bodyText = "D:\\词条\\11";
String linkText = "D:\\词条\\11链接";
modifyHyperLink(bodyText, linkText,url);
}
}
抓取百度百科数据
最新推荐文章于 2024-02-29 16:55:32 发布