Java 实现简单网页小爬虫程序

最新推荐文章于 2024-07-04 16:04:39 发布

etfox

最新推荐文章于 2024-07-04 16:04:39 发布

阅读量2.8k

点赞数 4

分类专栏： Java XML 文章标签： dom4j xml java 正则表达式

本文链接：https://blog.csdn.net/qq_29689487/article/details/51217603

版权

Java 同时被 2 个专栏收录

24 篇文章 0 订阅

订阅专栏

XML

1 篇文章 0 订阅

订阅专栏

使用正则表达式实现简单的网页爬虫程序：从用户输入的网页 URL 中抓取邮箱地址，并将结果同时写入按日期归档的文本文件（Email_N.txt）和 XML 文件（Email.xml）。

相关 jar 包：dom4j-1.6.1.jar jaxen-1.1-beta-6.jar

Java 源码：

package com.v7.netdpider;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Iterator;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web "spider": downloads the page whose URL is typed on standard input, extracts
 * e-mail addresses from it with a regular expression, and records every match both in a
 * dated text file (D:/mail/YEAR/MONTH/DAY/Email_N.txt) and, through
 * {@code NetSpiderXml.mkNetSpiderXML}, in an XML document.
 *
 * @author V7
 */
class NetSpider {

	/** Pattern used by {@link #getMail()}: "www." + word characters + "@" + host + three-letter suffix. */
	private static final Pattern WWW_MAIL_PATTERN =
			Pattern.compile("[w]{3}\\.\\w+@[a-zA-Z0-9]+(\\.[a-zA-Z]{3}+)");

	/** Pattern used by {@link #getMails(File)}: word characters + "@" + host + three-letter suffix. */
	private static final Pattern MAIL_PATTERN =
			Pattern.compile("\\w+@[a-zA-Z0-9]+(\\.[a-zA-Z]{3}+)");

	/** Names of the result files created by this class; group 1 is the running index. */
	private static final Pattern EMAIL_FILE_PATTERN =
			Pattern.compile("Email_(\\d+)\\.txt", Pattern.CASE_INSENSITIVE);

	/**
	 * Entry point. Builds today's output directory and starts a crawl that writes into
	 * {@code Email_0.txt} (directory did not exist yet) or into the next free
	 * {@code Email_N.txt} (directory already exists).
	 *
	 * @param args unused
	 * @throws Exception if the directory/file cannot be created or the crawl fails
	 */
	public static void main(String[] args) throws Exception {
		Calendar date = Calendar.getInstance();

		// Calendar.MONTH is zero-based. Pad to two digits only when needed; unconditionally
		// prefixing "0" produced "010".."012" for October to December.
		int monthNumber = date.get(Calendar.MONTH) + 1;
		String month = (monthNumber < 10 ? "0" : "") + monthNumber;

		String uri = "D:" + File.separator + "mail"
				+ File.separator + date.get(Calendar.YEAR)
				+ File.separator + month
				+ File.separator + date.get(Calendar.DAY_OF_MONTH);
		File file = new File(uri);

		if (!file.isDirectory()) {
			// Directory is missing: create it together with the first result file.
			System.out.println("指定文件路径不存在，创建文件路径并创建文件：Email_0.txt");
			isnanfile(file);
		} else {
			// Directory exists: write into a new file so earlier results are not overwritten.
			System.out.println("指定文件路径存在");
			newisnanfile(file);
		}
	}

	/**
	 * Prints every address found in the local file {@code d:\\mail.txt} that matches
	 * {@link #WWW_MAIL_PATTERN}.
	 *
	 * @throws Exception if the file cannot be opened or read
	 */
	public static void getMail() throws Exception {
		BufferedReader bufr = new BufferedReader(new FileReader("d:\\mail.txt"));
		try {
			String line;
			while ((line = bufr.readLine()) != null) {
				Matcher m = WWW_MAIL_PATTERN.matcher(line);
				while (m.find()) {
					System.out.println(m.group());
				}
			}
		} finally {
			// The original never closed the reader.
			bufr.close();
		}
	}

	/**
	 * Asks for a URL on standard input, downloads that page line by line and stores every
	 * address matching {@link #MAIL_PATTERN}: each match is passed to
	 * {@code NetSpiderXml.mkNetSpiderXML} and appended as one line to {@code file}.
	 * Failures while writing the text file are reported on stderr and do not stop the crawl.
	 *
	 * @param file text file the addresses are appended to
	 * @throws Exception if the URL is malformed, the page cannot be read, or the XML update fails
	 */
	public static void getMails(File file) throws Exception {
		// Deliberately left open: closing the Scanner would close System.in as well.
		@SuppressWarnings("resource")
		Scanner input = new Scanner(System.in);
		System.out.print("请输入要爬取的网页完整URL:");
		String uri = input.next();
		URL url = new URL(uri);
		URLConnection conn = url.openConnection();

		// NOTE(review): the page is decoded with the platform default charset, as before.
		BufferedReader buf = new BufferedReader(new InputStreamReader(conn.getInputStream()));
		BufferedWriter out = null;
		try {
			String line;
			while ((line = buf.readLine()) != null) {
				Matcher m = MAIL_PATTERN.matcher(line);
				while (m.find()) {
					String address = m.group();

					NetSpiderXml.mkNetSpiderXML(address);// record in the XML document

					try {
						// Opened lazily on the first match and reused afterwards, instead
						// of reopening the file in append mode for every single address.
						if (out == null) {
							out = new BufferedWriter(
									new OutputStreamWriter(new FileOutputStream(file, true)));
						}
						out.write(address);
						out.newLine();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
			}
		} finally {
			try {
				if (out != null) {
					out.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			} finally {
				// The original leaked the network reader.
				buf.close();
			}
		}
		System.out.println("网页爬虫，获取邮箱地址结束！");
		System.out.println("结果返回路径：" + file.getParent());
	}

	/**
	 * Creates the next free {@code Email_N.txt} inside an existing directory, so earlier
	 * results are not overwritten, and starts the crawl into it.
	 *
	 * @param file existing output directory
	 * @throws Exception if the file cannot be created or the crawl fails
	 */
	public static void newisnanfile(File file) throws Exception {
		int max_val = findFileList(file);// 0 when no result file exists yet
		File file2 = new File(file, "Email_" + max_val + ".txt");

		// createNewFile() returns false when the file already exists; only claim success
		// when a file was really created.
		if (file2.createNewFile()) {
			System.out.println("新建文件成功");
		} else {
			System.out.println("文件已存在，结果将追加到：" + file2.getName());
		}
		System.out.println("开始爬取...");
		getMails(file2);
	}

	/**
	 * Determines the index for the next result file in a directory.
	 *
	 * <p>Only entries named {@code Email_<digits>.txt} are considered. Other entries are
	 * skipped; previously a single name such as {@code a_b.txt} raised a
	 * {@code NumberFormatException} that ended the scan early, so the returned index could
	 * collide with an existing file.
	 *
	 * @param file directory to scan
	 * @return highest existing index plus one, or 0 when the directory holds no result
	 *         file or cannot be listed
	 */
	public static int findFileList(File file) {
		File[] entries = file.listFiles();
		// listFiles() returns null when the path is not a directory or cannot be read.
		if (entries == null || entries.length == 0) {
			return 0;
		}

		int max = -1;
		for (File entry : entries) {
			Matcher matcher = EMAIL_FILE_PATTERN.matcher(entry.getName());
			if (!matcher.matches()) {
				continue;
			}
			try {
				max = Math.max(max, Integer.parseInt(matcher.group(1)));
			} catch (NumberFormatException ignored) {
				// Index does not fit into an int: not a file this class created, skip it.
			}
		}
		return max + 1;
	}

	/**
	 * Creates the output directory (including missing parents) and the first result file
	 * {@code Email_0.txt}, then starts the crawl into it.
	 *
	 * @param file output directory that does not exist yet
	 * @throws Exception if the directory or file cannot be created or the crawl fails
	 */
	public static void isnanfile(File file) throws Exception {
		File file2 = new File(file, "Email_0.txt");

		// mkdirs() also returns false when the directory is already there, so failure is
		// only reported when the directory is still missing afterwards.
		if (!file.mkdirs() && !file.isDirectory()) {
			throw new IOException("无法创建目录：" + file.getPath());
		}
		file2.createNewFile();
		System.out.println("就绪...");
		System.out.println("开始爬取...");
		getMails(file2);
	}
}

输出 xml 源码：

package com.v7.netdpider;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;

import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;
import org.junit.Test;

/**
 * Stores crawled e-mail addresses as {@code <email>} children of the root element of
 * {@code src/Email.xml}, using dom4j.
 */
public class NetSpiderXml {
	/** Encoding declared in the XML prolog; the bytes on disk must use the same charset. */
	private static final String XML_ENCODING = "GB2312";

	/** Zero until the first call of {@link #mkNetSpiderXML(String)}; used to clear old entries once. */
	private static int i = 0;

	/**
	 * Appends one {@code <email>} element containing {@code val} to {@code src/Email.xml}.
	 * On the first call, the {@code email} elements already present under
	 * {@code emailroot} are removed first.
	 *
	 * <p>The former {@code @Test} annotation was dropped: a static method with a parameter
	 * is not a valid JUnit test method.
	 *
	 * @param val text of the new element
	 * @throws Exception if the XML file cannot be parsed or written
	 */
	public static void mkNetSpiderXML(String val) throws Exception {
		SAXReader sax = new SAXReader();
		File xmlFile = new File("src/Email.xml");
		Document document = sax.read(xmlFile);// throws if the file is missing or not well-formed
		if (i == 0) {
			i++;
			delNode(document, "email");// drop entries left over from an earlier run
			saveDocument(document, xmlFile);
		}
		Element root = document.getRootElement();
		addNode(root, "email", val);
		saveDocument(document, xmlFile);
	}

	/**
	 * Adds a child element with text content to {@code node}.
	 *
	 * @param node parent element
	 * @param nodeName name of the new child element
	 * @param content text content of the new child element
	 */
	public static void addNode(Element node, String nodeName, String content) {
		Element newNode = node.addElement(nodeName);
		newNode.setText(content);
	}

	/**
	 * Writes {@code document} pretty-printed to {@code xmlFile}, replacing its content.
	 *
	 * <p>The stream is encoded with the same charset that is declared in the XML prolog.
	 * Before, the writer used the platform default charset while the prolog claimed
	 * GB2312, which yields a mis-declared file whenever the two differ.
	 *
	 * @param document document to persist
	 * @param xmlFile target file
	 * @throws IOException if the file cannot be opened or written
	 */
	public static void saveDocument(Document document, File xmlFile)
			throws IOException {
		FileOutputStream fileOut = new FileOutputStream(xmlFile);
		try {
			Writer osWrite = new OutputStreamWriter(fileOut, XML_ENCODING);
			OutputFormat format = OutputFormat.createPrettyPrint();
			format.setEncoding(XML_ENCODING);
			XMLWriter writer = new XMLWriter(osWrite, format);
			writer.write(document);
			writer.flush();
		} finally {
			// Release the file handle even when writing fails.
			fileOut.close();
		}
	}

	/**
	 * Sets the attribute {@code id="email"} on the first child of {@code root} named
	 * {@code nodeName}. Does nothing when no such child exists (this used to end in a
	 * {@code NullPointerException}); the {@code nodeName} argument is now honoured
	 * instead of the hard-coded name "email".
	 *
	 * @param root element whose child is modified
	 * @param nodeName name of the child element
	 */
	public static void editAttribute(Element root, String nodeName) {
		Element node = root.element(nodeName);
		if (node == null) {
			return;
		}
		node.addAttribute("id", "email");
	}

	/**
	 * Removes every element selected by the XPath {@code //emailroot/email} whose name
	 * equals {@code name}. Exceptions raised while selecting or removing are printed to
	 * stderr and otherwise ignored.
	 *
	 * @param document document to modify
	 * @param name name of the elements to remove
	 */
	public static void delNode(Document document, String name) {
		try {
			@SuppressWarnings("unchecked")
			List<Node> list = document.selectNodes("//emailroot/email");
			for (Node node : list) {
				if (!(node instanceof Element)) {
					continue;
				}
				Element ele = (Element) node;
				if (ele.getName().equals(name)) {
					ele.getParent().remove(ele);
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

Email.xml（程序按相对路径 src/Email.xml 读取，运行前该文件必须已存在，且根节点为 emailroot）：

<?xml version="1.0" encoding="GB2312"?>

<emailroot> 
  
</emailroot>

etfox

关注

4
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
Java 实现简单网页小爬虫程序

使用正则表达式实现简单的网页爬虫程序：相关 jar 包：dom4j-1.6.1.jar jaxen-1.1-beta-6.jar Java 源码：package com.v7.netdpider;import java.io.BufferedReader;import java.io.BufferedWriter;import java.i
复制链接

扫一扫

专栏目录