由于数据抓取的需要,网页下载完成后得到的都是 HTML 源码;要抓取某一信息时,需要先对 HTML 结构做特定的分析,然后按照 class 或者 id 进行提取。如果不了解所抓取页面的 HTML 标签结构,得到的信息会非常杂乱,也很难从中发现有用的信息——这也是我目前做页面挖掘的最大难点。
以下代码是结合前面对html页面空格处理以及特定抓取和存储的代码:
下载页面代码:
package com.dazhihui;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class MyJsoup {
    /**
     * Downloads the page at {@code url} via jsoup and writes its HTML to {@code file}.
     *
     * @param url  page to fetch (5 s connect/read timeout, "Mozilla" user agent)
     * @param file destination for the raw HTML
     * @return true on success; false if the fetch or the write failed
     */
    public static boolean downloadPage(String url, File file) {
        try {
            Document doc = Jsoup.connect(url)
                    .data("jquery", "java")
                    .userAgent("Mozilla")
                    .cookie("auth", "tiken")
                    .timeout(5000)
                    .get();
            // try-with-resources: the stream is closed even when write() throws,
            // which the original code leaked.
            try (OutputStream out = new FileOutputStream(file)) {
                // NOTE(review): getBytes() uses the platform-default charset; the
                // downstream parser reads GBK (DazhihuiResolveCompanyProfile), so this
                // only round-trips on a GBK-default system — confirm and consider
                // passing an explicit charset end to end.
                out.write(doc.toString().getBytes());
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }
}
处理空格代码:
package com.dazhihui;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
public class ReplaceAllFileString {
    /**
     * Copies {@code oldFile} to {@code newFile} line by line, replacing every match
     * of {@code oldString} with {@code newString}.
     *
     * @param oldFile   source file to read
     * @param newFile   destination file (overwritten)
     * @param oldString pattern to replace — note: {@link String#replaceAll} treats
     *                  this as a regular expression, so regex metacharacters must
     *                  be escaped by the caller
     * @param newString replacement text
     * @return true on success; false if any I/O operation failed
     */
    public static boolean replaceAllFileString(File oldFile, File newFile, String oldString, String newString) {
        // try-with-resources closes both streams even on failure (original leaked them).
        // NOTE(review): FileReader/FileWriter use the platform-default charset — TODO
        // confirm this matches the encoding the page was saved in.
        try (BufferedReader reader = new BufferedReader(new FileReader(oldFile));
             BufferedWriter writer = new BufferedWriter(new FileWriter(newFile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(line.replaceAll(oldString, newString));
                // readLine() strips the line terminator; re-add one so the output
                // keeps its line structure instead of collapsing onto a single line.
                writer.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }
}
提取大智慧公司概况代码:
package com.dazhihui;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class DazhihuiResolveCompanyProfile {
    /**
     * Extracts the text of every {@code <td>} cell found inside elements carrying
     * CSS class {@code table_style_e} in a locally saved HTML file (parsed as GBK).
     *
     * @param file HTML file previously downloaded by MyJsoup
     * @return cell texts in document order; an empty list when the file cannot be
     *         read (never null, so callers may iterate without a null check)
     */
    public static ArrayList<String> resolvePageText(File file) {
        // Initialize up front so an IOException yields an empty list instead of the
        // null the original returned (which NPE'd the caller).
        ArrayList<String> list = new ArrayList<String>();
        try {
            Document doc = Jsoup.parse(file, "GBK");
            for (Element table : doc.getElementsByClass("table_style_e")) {
                // Hoisted: the original evaluated element.text() twice per element.
                String tableText = table.text();
                if (tableText != null && !"".equals(tableText)) {
                    for (Element row : table.select("tr")) {
                        for (Element cell : row.select("td")) {
                            list.add(cell.text());
                        }
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return list;
    }
}
主代码:
package com.dazhihui;
import java.io.File;
import java.util.ArrayList;
public class Dazhihui {
    /**
     * Pipeline driver: download the stock page, strip spaces from the saved HTML,
     * then print every extracted company-profile table cell.
     */
    public static void main(String[] args) {
        String url = "http://cj.gw.com.cn/news/stock/601288.shtml";
        File file = new File("C:/myjsoup/dazhihui/dazhihui.txt");
        File newFile = new File("C:/myjsoup/dazhihui/newdazhihui.txt");

        boolean downloaded = MyJsoup.downloadPage(url, file);
        System.out.println(downloaded);

        // NOTE(review): the pattern below is a literal space — presumably it was
        // "&nbsp;" before the blog formatting mangled it; confirm against the page.
        boolean replaced = ReplaceAllFileString.replaceAllFileString(file, newFile, " ", "");
        System.out.println(replaced);

        ArrayList<String> list = DazhihuiResolveCompanyProfile.resolvePageText(newFile);
        // Guard: resolvePageText may return null on an I/O failure; the original
        // loop dereferenced it unconditionally and NPE'd.
        if (list != null) {
            for (String cell : list) {
                System.out.println(cell);
            }
        }
    }
}