太平洋网页解析全程

最新推荐文章于 2021-01-14 15:15:15 发布

caoxu1987728

最新推荐文章于 2021-01-14 15:15:15 发布

阅读量1k

点赞数

分类专栏： Search Engine 文章标签： string exception newline attributes file image

本文链接：https://blog.csdn.net/caoxu1987728/article/details/2685602

版权

Search Engine 专栏收录该内容

28 篇文章 0 订阅

订阅专栏

package com.luceneheritrixbook.extractor.pconline.mobile;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;

import org.htmlparser.NodeFilter;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;

import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;

/**
 * Extractor for mobile-phone product pages mirrored from product.pconline.com.cn.
 *
 * For the page currently loaded into the parser, {@link #extract()} creates one
 * text file under the output path and writes, in order: the page URL, the first
 * two title tokens, one "name:value" line per product attribute and, for every
 * product image, a separator line followed by the name of the copied image file.
 */
public class ExtractPconlineMoblie extends Extractor 
{
	/** Directory name in the input path after which the mirrored host name starts. */
	private static final String DATA_DIR = "data";

	/**
	 * Extracts title, attributes and images of the loaded page into a text file.
	 * Each section handles its own errors (printing the stack trace) so that a
	 * failure in one section does not prevent the following ones from running.
	 * Nothing is written when no page is loaded or no title cell is found.
	 */
	public void extract() 
	{
		// loadFile() may have failed and left no parser; there is nothing to extract then.
		if (getParser() == null) {
			return;
		}
		BufferedWriter bw = null;
		try {
			bw = writeTitleSection();
			// Without a title no output file was opened, so the remaining
			// sections would have nowhere to write.
			if (bw == null) {
				return;
			}
			// The parser is consumed by each parse() call and must be rewound.
			getParser().reset();
			writeAttributeSection(bw);
			getParser().reset();
			writeImageSection(bw);
		} finally {
			closeQuietly(bw);
		}
	}

	/**
	 * Opens the output file named after the page title and writes the page URL
	 * and the first two title tokens into it.
	 *
	 * @return the writer of the output file that was opened last, or null when
	 *         no title cell matched or the file could not be opened
	 */
	private BufferedWriter writeTitleSection()
	{
		// Title cells: <td class="hi"> without a width attribute.
		NodeFilter titleFilter = new AndFilter(new TagNameFilter("td"),
				new AndFilter(new HasAttributeFilter("class", "hi"),
						new NotFilter(new HasAttributeFilter("width"))));
		BufferedWriter bw = null;
		try {
			NodeList titleNodes = getParser().parse(titleFilter);
			for (int i = 0; i < titleNodes.size(); i++) {
				TableColumn node = (TableColumn) titleNodes.elementAt(i);
				// The cell content is split on spaces into title tokens.
				String[] names = node.getChildrenHTML().split(" ");
				// File name: all tokens joined by '-', plus a timestamp suffix.
				StringBuffer title = new StringBuffer();
				for (int k = 0; k < names.length; k++) {
					title.append(names[k]).append("-");
				}
				title.append((new Date()).getTime());

				// Release the file of a previous title cell before opening the next one.
				closeQuietly(bw);
				bw = null;
				bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath()
						+ title + ".txt")));

				String url = buildPageUrl();
				System.out.println(url);   // progress/debug output
				bw.write(url + NEWLINE);
				bw.write(names[0] + NEWLINE);
				// A title without a space has a single token; keep the two-line
				// layout by writing an empty second line.
				bw.write((names.length > 1 ? names[1] : "") + NEWLINE);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return bw;
	}

	/**
	 * Rebuilds the original URL of the page from the path of the mirrored file:
	 * everything after the "data" directory, with '/' separators, prefixed
	 * with "http://".
	 */
	private String buildPageUrl()
	{
		// Normalise Windows separators so the path can be used as a URL.
		String path = getInuputFilePath().replace('\\', '/');
		String urlSeg = path;
		int dataPos = path.indexOf(DATA_DIR);
		if (dataPos != -1) {
			// Skip the directory name and the separator that follows it.
			int hostPos = Math.min(dataPos + DATA_DIR.length() + 1, path.length());
			urlSeg = path.substring(hostPos);
		}
		return "http://" + urlSeg;
	}

	/**
	 * Writes one "name:value" line for every attribute-name cell of the page.
	 *
	 * @param bw writer of the open output file
	 */
	private void writeAttributeSection(BufferedWriter bw)
	{
		// Attribute-name cells: <td WIDTH="198">.
		NodeFilter attributesFilter = new AndFilter(new TagNameFilter("td"),
				new HasAttributeFilter("WIDTH", "198"));
		try {
			NodeList attributeNodes = this.getParser().parse(attributesFilter);
			for (int i = 0; i < attributeNodes.size(); i++) {
				TableColumn node = (TableColumn) attributeNodes.elementAt(i);
				String result = getProp("(.*)", node.toHtml(), 1);
				// No match, or the match still contains markup (e.g. a link
				// around the name): use the text content of the cell instead.
				if (result == null || result.indexOf("<") != -1) {
					result = node.toPlainTextString();
				}
				// The value cell is two siblings away: a text node (line break)
				// sits between the name cell and the value cell.
				Object valueNode = node.getNextSibling() == null
						? null : node.getNextSibling().getNextSibling();
				if (!(valueNode instanceof TableColumn)) {
					continue;   // name cell without a value cell
				}
				TableColumn nodeExt = (TableColumn) valueNode;

				bw.write(StringUtils.trim(result) + ":"
						+ StringUtils.trim(nodeExt.getChildrenHTML()));
				bw.newLine();
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Copies every product image of the page into the image directory under a
	 * hashed file name and records that name in the output file.
	 *
	 * @param bw writer of the open output file
	 */
	private void writeImageSection(BufferedWriter bw)
	{
		// Product images: <IMG WIDTH="200" HEIGHT="150">.
		NodeFilter imageFilter = new AndFilter(new TagNameFilter("IMG"),
				new AndFilter(new HasAttributeFilter("WIDTH", "200"),
						new HasAttributeFilter("HEIGHT","150")));
		try {
			NodeList imageNodes = this.getParser().parse(imageFilter);
			for (int i = 0; i < imageNodes.size(); i++) {
				ImageTag node = (ImageTag) imageNodes.elementAt(i);
				String imageUrl = node.getAttribute("SRC");
				if (imageUrl == null) {
					continue;   // image tag without a source
				}
				// File extension of the image, e.g. "jpg".
				String fileType = imageUrl.substring(imageUrl.lastIndexOf(".") + 1);
				// New file name: hash of the image URL plus the original extension.
				String newImageFile = StringUtils.encodePassword(
						imageUrl, HASH_ALGORITHM)
						+ "." + fileType;
				imageUrl = StringUtils.replace(imageUrl, "+", " ");
				// Copy the image out of the mirror directory under its new name.
				copyImage(imageUrl, newImageFile);
				bw.write(SEPARATOR + NEWLINE);
				bw.write(newImageFile + NEWLINE);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/** Closes the writer if it is open, reporting (not propagating) failures. */
	private static void closeQuietly(BufferedWriter bw)
	{
		if (bw == null) {
			return;
		}
		try {
			bw.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Runs the extractor over every mirrored product page below
	 * F://data//product.pconline.com.cn//product.
	 */
	public static void main(String[] args) throws Exception 
	{

		Extractor extractor = new ExtractPconlineMoblie();
		extractor.setOutputPath("c://product//mobile//");
		extractor.setImageDir("c://product//image//");
		extractor.setMirrorDir("F://");
		
		traverse(extractor, new File("F://data//product.pconline.com.cn//product"));

	}
}

注：抓取到的图片及产品信息（镜像数据）全部放在 F:/ 盘，对应上面 main 方法中的 setMirrorDir("F://") 以及 traverse 的起始目录。

基类Extractor：

package com.luceneheritrixbook.extractor;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Parser;

//import com.luceneheritrixbook.extractor.pconline.mobile.ExtractPconlineMoblie;

/**
 * Base class of the page extractors.
 *
 * It holds the configuration shared by all extractors (output path, mirror
 * directory, image directory), owns the HTMLParser instance of the page being
 * processed, and offers helpers for regular-expression extraction, image
 * copying and recursive traversal of a directory of mirrored pages.
 */
public abstract class Extractor {

	/** Line terminator (CR LF) written between the records of an output file. */
	protected static final String NEWLINE = "\r\n";

	/** Marks the end of the scheme part of a URL, as in "http://". */
	private static final String SCHEME_SEPARATOR = "://";

	/** Image copied instead when the requested image is not in the mirror. */
	private static final String DEFAULT_IMAGE = "e://noimage.jpg";

	/**
	 * Directory all result files are written to.
	 */
	private String outputPath = "";

	/**
	 * Path of the file currently being processed; null when no file is loaded.
	 */
	private String inuputFilePath;

	/**
	 * Root directory of the mirrored pages (the "mirror" directory of Heritrix).
	 */
	private String mirrorDir = "";

	/**
	 * Directory the processed product images are copied to.
	 */
	private String imageDir = "";

	/**
	 * HTMLParser instance for the loaded file; null when no file is loaded.
	 */
	private Parser parser;

	/**
	 * Hash algorithm applied to image URLs to derive image file names (MD5).
	 */
	protected static final String HASH_ALGORITHM = "md5";

	/**
	 * Separator line written between sections of an output file.
	 */
	public static final String SEPARATOR = "======================";

	/**
	 * Loads the page file to be processed and prepares a GBK parser for it.
	 * On failure the stack trace is printed; if the parser could not be created
	 * at all, {@link #getParser()} and {@link #getInuputFilePath()} return null
	 * afterwards instead of still pointing at the previously loaded page.
	 *
	 * @param path path of the page file
	 */
	public void loadFile(String path) {
		// Forget the previous page first so a failure cannot leave stale state.
		parser = null;
		inuputFilePath = null;
		try {
			parser = new Parser(path);
			inuputFilePath = path;
			parser.setEncoding("GBK");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Returns the output directory.
	 */
	public String getOutputPath() {
		return outputPath;
	}

	/**
	 * Sets the output directory; normally done once when the extractor is set up.
	 */
	public void setOutputPath(String outputPath) {
		this.outputPath = outputPath;
	}

	/**
	 * Returns the parser of the loaded page, or null when no page is loaded.
	 */
	public Parser getParser() {
		return parser;
	}

	/**
	 * Finds the first match of a regular expression and returns one of its groups.
	 *
	 * @param pattern regular expression to search for
	 * @param match   text to search in
	 * @param index   number of the capture group to return (0 = whole match)
	 * @return the requested group of the first match, or null when the pattern
	 *         does not match
	 * @throws IndexOutOfBoundsException if the pattern has no group {@code index}
	 */
	protected String getProp(String pattern, String match, int index) {
		Matcher matcher = Pattern.compile(pattern).matcher(match);
		if (matcher.find()) {
			return matcher.group(index);
		}
		return null;
	}

	/**
	 * Parses the loaded page and stores the extracted product information.
	 * Implemented by the site-specific subclasses.
	 */
	public abstract void extract();

	/**
	 * Returns the path of the file being processed, or null when none is loaded.
	 */
	public String getInuputFilePath() {
		return inuputFilePath;
	}

	/**
	 * Copies an image from the mirror directory to the image directory.
	 * The location inside the mirror is the image URL without its scheme
	 * ("http://host/a.jpg" is looked up as "host/a.jpg"). When the image is not
	 * in the mirror, the default image is copied instead.
	 *
	 * @param image_url      URL of the image as found in the page
	 * @param new_image_file file name to store the image under in the image directory
	 * @return true if the image was copied, false on any failure (the stack
	 *         trace is printed)
	 */
	public boolean copyImage(String image_url, String new_image_file) 
	{
		if (image_url == null || new_image_file == null) {
			return false;
		}

		// Strip the scheme whatever it is, instead of assuming a 7-character "http://".
		int schemeEnd = image_url.indexOf(SCHEME_SEPARATOR);
		String dirs = schemeEnd == -1
				? image_url
				: image_url.substring(schemeEnd + SCHEME_SEPARATOR.length());

		FileInputStream in1 = null;
		FileOutputStream out1 = null;
		try { 
			File file_in = new File(new File(mirrorDir), dirs);
			if (!file_in.exists()) {
				file_in = new File(DEFAULT_IMAGE);
			}
			
			File file_out = new File(new File(imageDir), new_image_file);

			in1 = new FileInputStream(file_in);
			out1 = new FileOutputStream(file_out);

			// Plain byte-wise copy through a small buffer.
			byte[] bytes = new byte[1024];
			int c;
			while ((c = in1.read(bytes)) != -1) {
				out1.write(bytes, 0, c);
			}

			// Close the output here so that a failing close is reported as a
			// failed copy; the finally block only cleans up what is still open.
			out1.close();
			out1 = null;
			return true;
		} catch (Exception e) {
			e.printStackTrace();
			return false;
		} finally {
			if (in1 != null) {
				try {
					in1.close();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
			if (out1 != null) {
				try {
					out1.close();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Returns the directory the images are copied to.
	 */
	public String getImageDir() {
		return imageDir;
	}

	/**
	 * Sets the directory the images are copied to.
	 */
	public void setImageDir(String imageDir) {
		this.imageDir = imageDir;
	}

	/**
	 * Returns the root directory of the mirrored pages.
	 */
	public String getMirrorDir() {
		return mirrorDir;
	}

	/**
	 * Sets the root directory of the mirrored pages.
	 */
	public void setMirrorDir(String mirrorDir) {
		this.mirrorDir = mirrorDir;
	}

	/**
	 * Sets the path of the file being processed.
	 */
	public void setInuputFilePath(String inuputFilePath) {
		this.inuputFilePath = inuputFilePath;
	}

	/** Number of pages handed to an extractor by {@link #traverse}. */
	static int count = 0;
	
	/**
	 * Walks a file or directory tree and runs the extractor on every page in it.
	 * A file is treated as a page when its absolute path ends with ".html" and
	 * contains no underscore. Pages that cannot be loaded are skipped.
	 *
	 * @param extractor extractor to run on each page
	 * @param path      file or directory to process; null is ignored
	 * @throws Exception declared for callers; nothing in this method throws a
	 *                   checked exception itself
	 */
	public static void traverse(Extractor extractor, File path)throws Exception 
	{
		if (path == null) 
		{
			return;
		}

		if (path.isDirectory()) 
		{
			String[] files = path.list();
			// list() returns null when the directory cannot be read.
			if (files == null) 
			{
				return;
			}
			for (int i = 0; i < files.length; i++) 
			{
				traverse(extractor, new File(path, files[i]));
			}
			return;
		}

		// Example of an accepted path:
		// F:/data/product.pconline.com.cn/product/934/p93443.html
		String absolutePath = path.getAbsolutePath();
		if (absolutePath.endsWith(".html") && absolutePath.indexOf("_") == -1) 
		{
			System.out.println(path);
			count++;
			extractor.loadFile(absolutePath);
			// loadFile() has already reported the error if no parser was created.
			if (extractor.getParser() != null) 
			{
				extractor.extract();
			}
			
			System.out.println(count);
		}
	}

}

htmlparser 的基本用法还有很多，想了解更多请点击：http://blog.csdn.net/caoxu1987728/category/395326.aspx

而对于StringUtils这个类，作者也没有进行详细的解释，我们下章再见……

caoxu1987728

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
2
评论
太平洋网页解析全程

package com.luceneheritrixbook.extractor.pconline.mobile;import java.io.BufferedWriter;import java.io.File;import java.io.FileWriter;import java.io.IOException;import java.util.Date;im
复制链接

扫一扫

专栏目录