java 抓取网页内容小工具

最新推荐文章于 2024-04-30 14:28:06 发布

FieldSoft-HelloClyde

最新推荐文章于 2024-04-30 14:28:06 发布

阅读量1.7k

点赞数 2

分类专栏：编程语言基础 文章标签：java 网页抓取 excel

本文链接：https://blog.csdn.net/kydkong/article/details/46892925

版权

编程语言基础专栏收录该内容

7 篇文章 0 订阅

订阅专栏

之超同志今天问我会不会抓网页的内容，然后把一些表格整理成excel。

好吧，我是不会的，但是我想试试，结果还是可行的。

先说说他的需求吧，他需要把http://www.zjex.com.cn/view/company.php?func=listAll&catalog=0401&page=1这个网站里的所有公司信息都存在一张excel表格里。

我之前没有使用java抓过网页内容，不过用vb.net写过，稍微知道一些。

我搜索到了这篇文章的内容：http://blog.csdn.net/xiaoyu411502/article/details/4500806

然后对其代码删减了点，就可以抓网页的源代码了。

github地址：https://github.com/FieldSoft-HelloClyde/NetGrab

对于其他网址有相应的适配版本

下面是程序执行的第一步，抓取第一页到第22页的内容，把公司子网页的链接存下来。

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;


public class Main {
	/** Listing URL without the page number; the page number is appended per request. */
	private static final String LIST_PAGE_URL = "http://www.zjex.com.cn/view/company.php?func=listAll&catalog=0401&page=";
	/** First listing page to fetch. */
	private static final int FIRST_PAGE = 1;
	/** Last listing page to fetch (inclusive). */
	private static final int LAST_PAGE = 22;
	/** Connect and read timeout for a single request, in milliseconds. */
	private static final int TIMEOUT_MILLIS = 10000;

	/**
	 * Entry point: downloads listing pages {@code FIRST_PAGE..LAST_PAGE} and
	 * extracts the company detail links from each of them. Every link is
	 * printed to standard output by {@link #GetWebUrl}.
	 *
	 * @param args unused
	 */
	static public void main(String[] args){
		ArrayList<String> companyUrls = new ArrayList<String>();
		for (int pageId = FIRST_PAGE; pageId <= LAST_PAGE; pageId++){
			String pageHtml = GetWebSiteSourceCode(LIST_PAGE_URL + pageId);
			GetWebUrl(pageHtml, companyUrls);
		}
	}


	/**
	 * Extracts every link that follows the marker
	 * {@code <div class="comimg fl"><a href="} from the given page source,
	 * prints it and appends it to {@code DesArray}.
	 *
	 * @param WebSiteString page source to scan; {@code null} is treated as empty
	 * @param DesArray list that receives the extracted links, in document order
	 */
	public static void GetWebUrl(String WebSiteString,ArrayList<String> DesArray){
		if (WebSiteString == null){
			return;
		}
		final String startMarker = "<div class=\"comimg fl\"><a href=\"";
		final String endMarker = "\">";
		int markerIndex = WebSiteString.indexOf(startMarker);
		while (markerIndex != -1){
			int urlStart = markerIndex + startMarker.length();
			int urlEnd = WebSiteString.indexOf(endMarker, urlStart);
			if (urlEnd == -1){
				// Truncated page: the anchor was opened but never closed.
				// Calling substring with -1 would throw, so stop scanning.
				break;
			}
			String link = WebSiteString.substring(urlStart, urlEnd);
			System.out.println(link);
			DesArray.add(link);
			markerIndex = WebSiteString.indexOf(startMarker, urlEnd + endMarker.length());
		}
	}

	/**
	 * Downloads the given URL with an HTTP GET and returns the body decoded
	 * as UTF-8.
	 *
	 * This is best-effort: any failure is printed via
	 * {@code printStackTrace()} and whatever was read up to that point
	 * (possibly the empty string) is returned. The method never throws.
	 *
	 * @param url absolute http URL to fetch
	 * @return the decoded body, or a partial/empty string on failure
	 */
	public static String GetWebSiteSourceCode(String url) {
		StringBuilder page = new StringBuilder();
		try {
			HttpURLConnection uc = (HttpURLConnection) new URL(url).openConnection();
			uc.setConnectTimeout(TIMEOUT_MILLIS);
			// Without a read timeout a stalled server blocks the tool forever.
			uc.setReadTimeout(TIMEOUT_MILLIS);
			// No setDoOutput(true): a GET sends no request body.
			uc.setRequestMethod("GET");
			uc.setUseCaches(false);

			InputStream in = new BufferedInputStream(uc.getInputStream());
			try {
				Reader rd = new InputStreamReader(in, "utf-8");
				char[] buffer = new char[4096];
				int count;
				while ((count = rd.read(buffer)) != -1) {
					page.append(buffer, 0, count);
				}
			} finally {
				// Close even when reading fails so the connection is released.
				in.close();
			}
		} catch (Exception e) {
			// Deliberately broad: callers rely on this method never throwing.
			e.printStackTrace();
		}
		return page.toString();
	}
}

然后是第二步内容，根据第一步得到的url（需要先把第一步输出的链接逐行保存到 d:/url.txt），对网页的表格内容进行抓取，保存到txt文档 d:/table.txt 中，每一项都用空格分开。

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.Scanner;


public class GetTable {
	/**
	 * Reads one relative URL per line from {@code d:/url.txt}, downloads each
	 * company page and writes one space-separated row per page to
	 * {@code d:/table.txt}, preceded by a header row with the field labels.
	 *
	 * @param args unused
	 */
	public static void main(String[] args){
		Scanner FileScanner = null;
		PrintWriter DesTxt = null;
		try {
			FileScanner = new Scanner(new FileReader("d:/url.txt"));
			DesTxt = new PrintWriter("d:/table.txt");
			String[] TypeStrArray = {"公司名称","公司简称","企业代码","法人代表","挂牌日期","董秘","董秘电话","董秘QQ","董秘邮箱","注册日期"
									,"注册资本","推荐机构","会计师事务所","所属板块","所属行业","所在地","注册地址","办公地址","公司网址","经营范围"};
			// Header row.
			for (String label : TypeStrArray){
				DesTxt.print(label + " ");
			}
			DesTxt.println();

			while (FileScanner.hasNextLine()){
				String WebUrl = FileScanner.nextLine().trim();
				if (WebUrl.length() == 0){
					// A blank line would otherwise fetch the site root and emit a junk row.
					continue;
				}
				String WebSiteString = Main.GetWebSiteSourceCode("http://www.zjex.com.cn" + WebUrl);
				ReadTable(WebSiteString,DesTxt,TypeStrArray);
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} finally {
			// Close both files even if a page fails half-way, so rows written so far are flushed.
			if (DesTxt != null){
				DesTxt.close();
			}
			if (FileScanner != null){
				FileScanner.close();
			}
		}
	}

	/**
	 * Writes one row to {@code DesPW}: for every label in {@code TypeArray},
	 * finds the first occurrence of the label in the page source, takes the
	 * line that follows the label's line, strips its tags and prints the
	 * result followed by a space. Each value is also echoed to standard output.
	 *
	 * A label that is missing, or that sits on the last line of the page,
	 * produces an empty value so the columns stay aligned. CR, LF and CRLF
	 * line endings are all accepted.
	 *
	 * @param WebSiteString page source to read from
	 * @param DesPW writer that receives the row
	 * @param TypeArray field labels, in output column order
	 */
	public static void ReadTable(String WebSiteString,PrintWriter DesPW,String[] TypeArray){
		for (String label : TypeArray){
			String value = "";
			int labelIndex = WebSiteString.indexOf(label);
			if (labelIndex != -1){
				int breakIndex = indexOfLineBreak(WebSiteString, labelIndex);
				if (breakIndex != -1){
					int lineStart = breakIndex + 1;
					// Treat CRLF as a single line terminator.
					if (WebSiteString.charAt(breakIndex) == '\r'
							&& lineStart < WebSiteString.length()
							&& WebSiteString.charAt(lineStart) == '\n'){
						lineStart++;
					}
					int lineEnd = indexOfLineBreak(WebSiteString, lineStart);
					if (lineEnd == -1){
						// The value is on the final, unterminated line.
						lineEnd = WebSiteString.length();
					}
					value = GetValueString(WebSiteString.substring(lineStart, lineEnd));
				}
			}
			System.out.println(value);
			DesPW.print(value + " ");
		}
		DesPW.println();
	}

	/**
	 * Returns the index of the first CR or LF at or after {@code from},
	 * or -1 if there is none.
	 */
	private static int indexOfLineBreak(String text, int from){
		for (int i = from; i < text.length(); i++){
			char c = text.charAt(i);
			if (c == '\r' || c == '\n'){
				return i;
			}
		}
		return -1;
	}

	/**
	 * Removes everything between {@code '<'} and the next {@code '>'}
	 * (inclusive) from {@code Src} and trims the remainder.
	 *
	 * @param Src one line of page source
	 * @return the text outside the tags, with leading and trailing whitespace removed
	 */
	public static String GetValueString(String Src){
		StringBuilder text = new StringBuilder();
		boolean insideTag = false;
		for (int index = 0; index < Src.length(); index++){
			char current = Src.charAt(index);
			if (insideTag){
				if (current == '>'){
					insideTag = false;
				}
			}
			else if (current == '<'){
				insideTag = true;
			}
			else{
				text.append(current);
			}
		}
		return text.toString().trim();
	}
}

保存在txt里后，用excel打开这个txt就会出现向导，跟着向导，选择使用分隔符分割，分隔符选择空格，然后把每一列类型都改为文本。

好了，任务完成了

贴一下成果