由于数据抓取的需要,网页下载完成后得到的都是 HTML 源码;要抓取某一信息时,需要先对 HTML 结构做特定的分析,然后按照 class 或者 id 进行提取。如果不了解所抓取页面的 HTML 标签结构,得到的信息会非常杂乱,也很难从中发现有用的信息——这也是我目前做页面挖掘的最大难点。
以下代码是结合前面对html页面空格处理以及特定抓取和存储的代码:
下载页面代码:
package com.dazhihui;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class MyJsoup {
    /**
     * Downloads the page at {@code url} via jsoup and writes its HTML to {@code file}.
     *
     * @param url  page to fetch (5 s connect/read timeout, "Mozilla" user agent)
     * @param file destination for the raw HTML
     * @return true on success; false if the fetch or the write failed
     */
    public static boolean downloadPage(String url, File file) {
        try {
            Document doc = Jsoup.connect(url)
                    .data("jquery", "java")
                    .userAgent("Mozilla")
                    .cookie("auth", "tiken")
                    .timeout(5000)
                    .get();
            // try-with-resources: the stream is closed even when write() throws,
            // which the original code leaked.
            try (OutputStream out = new FileOutputStream(file)) {
                // NOTE(review): getBytes() uses the platform-default charset; the
                // downstream parser reads GBK (DazhihuiResolveCompanyProfile), so this
                // only round-trips on a GBK-default system — confirm and consider
                // passing an explicit charset end to end.
                out.write(doc.toString().getBytes());
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }
}
处理空格代码:
package com.dazhihui;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
public class ReplaceAllFileString {
    /**
     * Copies {@code oldFile} to {@code newFile} line by line, replacing every match
     * of {@code oldString} with {@code newString}.
     *
     * @param oldFile   source file to read
     * @param newFile   destination file (overwritten)
     * @param oldString pattern to replace — note: {@link String#replaceAll} treats
     *                  this as a regular expression, so regex metacharacters must
     *                  be escaped by the caller
     * @param newString replacement text
     * @return true on success; false if any I/O operation failed
     */
    public static boolean replaceAllFileString(File oldFile, File newFile, String oldString, String newString) {
        // try-with-resources closes both streams even on failure (original leaked them).
        // NOTE(review): FileReader/FileWriter use the platform-default charset — TODO
        // confirm this matches the encoding the page was saved in.
        try (BufferedReader reader = new BufferedReader(new FileReader(oldFile));
             BufferedWriter writer = new BufferedWriter(new FileWriter(newFile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(line.replaceAll(oldString, newString));
                // readLine() strips the line terminator; re-add one so the output
                // keeps its line structure instead of collapsing onto a single line.
                writer.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }
}
提取大智慧公司概况代码:
package com.dazhihui;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class DazhihuiResolveCompanyProfile {
    /**
     * Extracts the text of every {@code <td>} cell found inside elements carrying
     * CSS class {@code table_style_e} in a locally saved HTML file (parsed as GBK).
     *
     * @param file HTML file previously downloaded by MyJsoup
     * @return cell texts in document order; an empty list when the file cannot be
     *         read (never null, so callers may iterate without a null check)
     */
    public static ArrayList<String> resolvePageText(File file) {
        // Initialize up front so an IOException yields an empty list instead of the
        // null the original returned (which NPE'd the caller).
        ArrayList<String> list = new ArrayList<String>();
        try {
            Document doc = Jsoup.parse(file, "GBK");
            for (Element table : doc.getElementsByClass("table_style_e")) {
                // Hoisted: the original evaluated element.text() twice per element.
                String tableText = table.text();
                if (tableText != null && !"".equals(tableText)) {
                    for (Element row : table.select("tr")) {
                        for (Element cell : row.select("td")) {
                            list.add(cell.text());
                        }
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return list;
    }
}
主代码:
package com.dazhihui;
import java.io.File;
import java.util.ArrayList;
public class Dazhihui {
    /**
     * Pipeline driver: download the stock page, strip spaces from the saved HTML,
     * then print every extracted company-profile table cell.
     */
    public static void main(String[] args) {
        String url = "http://cj.gw.com.cn/news/stock/601288.shtml";
        File file = new File("C:/myjsoup/dazhihui/dazhihui.txt");
        File newFile = new File("C:/myjsoup/dazhihui/newdazhihui.txt");

        boolean downloaded = MyJsoup.downloadPage(url, file);
        System.out.println(downloaded);

        // NOTE(review): the pattern below is a literal space — presumably it was
        // "&nbsp;" before the blog formatting mangled it; confirm against the page.
        boolean replaced = ReplaceAllFileString.replaceAllFileString(file, newFile, " ", "");
        System.out.println(replaced);

        ArrayList<String> list = DazhihuiResolveCompanyProfile.resolvePageText(newFile);
        // Guard: resolvePageText may return null on an I/O failure; the original
        // loop dereferenced it unconditionally and NPE'd.
        if (list != null) {
            for (String cell : list) {
                System.out.println(cell);
            }
        }
    }
}