Java 爬虫

用 Java 写的一个网页爬虫

现在觉得 Java 用起来真不如 JS、Python 这些脚本语言爽:它们代码写得少,用 Sublime 就能直接搞定!

package calcium.tools.grex;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small regex-based scraper for a saved (or live) forum thread-list page.
 *
 * <p>The page is read into memory with line terminators removed, three columns
 * are extracted with regular expressions (link text, the {@code xi2} span text
 * and the {@code <em>} text of each {@code num} cell), and one output line per
 * link is written as {@code title V spanText V emText} (no spaces).
 */
public class MyGrex {

	/** Charset used to decode both local files and downloaded pages. */
	private static final String ENCODING = "GBK";

	/** Input used when no command-line argument is given. */
	private static final String DEFAULT_INPUT = "F:a.html";

	/** Output file used when no second command-line argument is given. */
	private static final String DEFAULT_OUTPUT = "F:a.txt";

	/** Separator written between the fields of one output line. */
	private static final String FIELD_SEPARATOR = "V";

	// The two messages below had been corrupted into replacement characters by a
	// GBK/UTF-8 mix-up. They are restored here and spelled with Unicode escapes
	// so that the literals no longer depend on the encoding of this source file.

	/** Chinese for "The specified file could not be found". */
	private static final String MSG_FILE_NOT_FOUND =
			"\u627e\u4e0d\u5230\u6307\u5b9a\u7684\u6587\u4ef6";

	/** Chinese for "Error while reading the file content". */
	private static final String MSG_READ_FAILED =
			"\u8bfb\u53d6\u6587\u4ef6\u5185\u5bb9\u51fa\u9519";

	/** Thread link; group 2 is the link text. */
	private static final Pattern LINK_TITLE = Pattern
			.compile("<a href=\"http://bbs.zhiyoo.com/forum.php"
					+ "\\?mod=viewthread&tid=[0-9]{7}\" target=\"_blank\"(| style=\"font-weight: bold;color: #8F2A90\")>"
					+ "(.*?)</a>");

	/**
	 * Thread link wrapped in its {@code thread_NNNNNNN} span; group 1 is the href.
	 *
	 * <p>NOTE(review): {@code [\s\S]} matches exactly one character, so only links
	 * whose text is a single character can match, and the pattern ends with an
	 * opening {@code <span>} rather than a closing one. This looks unintended but
	 * the expected markup is not visible here, so the pattern is left unchanged.
	 * Side note kept from the original author: {@code [^\x00-\xff]*} matches a
	 * run of characters outside Latin-1 (e.g. Chinese).
	 */
	private static final Pattern LINK_HREF = Pattern
			.compile("<span id=\"thread_[0-9]{7}\"><a href=\"(.*?)\" target=\"_blank\"(| style=\"font-weight: bold;color: #8F2A90\")>[\\s\\S]</a><span>");

	/** Any {@code <span class="xi2">}; group 1 is its content. */
	private static final Pattern XI2_SPAN = Pattern.compile(
			"<span class=\"xi2\">(.+?)</span>", Pattern.DOTALL);

	/**
	 * A {@code num} cell, e.g.
	 * {@code <td class="num"><span class="xi2">25</span><em>504</em></td>};
	 * group 1 is the {@code <em>} content. The span number may have any number
	 * of digits (it used to be restricted to exactly two, which silently
	 * skipped every other row).
	 */
	private static final Pattern NUM_CELL_EM = Pattern.compile(
			"<td class=\"num\"><span class=\"xi2\">[0-9]+</span><em>(.+?)</em></td>",
			Pattern.DOTALL);

	/**
	 * Extracts the three columns from the input page and writes them to a file.
	 *
	 * @param args optional; {@code args[0]} is the input (a local path, or a URL
	 *             starting with {@code http://} or {@code https://}) and
	 *             {@code args[1]} is the output path. Missing arguments fall back
	 *             to {@code F:a.html} and {@code F:a.txt}.
	 * @throws IOException if the output file cannot be opened
	 */
	public static void main(String[] args) throws IOException {
		String source = args.length > 0 ? args[0] : DEFAULT_INPUT;
		String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

		StringBuffer buf = new StringBuffer(1024 * 1024);
		if (source.startsWith("http://") || source.startsWith("https://")) {
			readHtml(source, buf);
		} else {
			readTxtFile(source, buf);
		}
		String page = buf.toString();

		List<String> titles = getLink(page);
		List<String> spanTexts = getHuifu(page);
		List<String> emTexts = getDianji(page);
		// Result is not used; the call is kept for the matches it prints.
		getLinkl(page);

		// try-with-resources flushes and closes the writer even if a write fails.
		try (PrintWriter pw = new PrintWriter(new FileWriter(outputPath))) {
			for (int i = 0; i < titles.size(); i++) {
				String title = titles.get(i);
				System.out.println(title);
				pw.write(title);
				pw.write(FIELD_SEPARATOR);
				// The three extractors are independent and may return different
				// counts; a missing value becomes an empty field instead of an
				// IndexOutOfBoundsException.
				pw.write(fieldAt(spanTexts, i));
				pw.write(FIELD_SEPARATOR);
				pw.write(fieldAt(emTexts, i));
				pw.write("\n");
			}
		}
	}

	/**
	 * Collects the link text of every thread link in the page and prints each one.
	 *
	 * @param html page markup to scan
	 * @return link texts in document order; empty if nothing matches
	 * @throws IOException never thrown by this implementation; kept for callers
	 */
	public static List<String> getLink(String html) throws IOException {
		return collectGroup(LINK_TITLE, html, 2);
	}

	/**
	 * Collects the href of every link matched by the {@code thread_NNNNNNN}
	 * pattern, printing both the href and the whole match.
	 *
	 * @param html page markup to scan
	 * @return hrefs in document order; empty if nothing matches
	 * @throws IOException never thrown by this implementation; kept for callers
	 */
	public static List<String> getLinkl(String html) throws IOException {
		List<String> hrefs = new ArrayList<>();
		Matcher m = LINK_HREF.matcher(html);
		while (m.find()) {
			hrefs.add(m.group(1));
			System.out.println(m.group(1));
			System.out.println(m.group());
		}
		return hrefs;
	}

	/**
	 * Collects the content of every {@code <span class="xi2">} and prints each one.
	 * Presumably these are the reply counts ("huifu") - confirm against the page.
	 *
	 * @param s page markup to scan
	 * @return span contents in document order; empty if nothing matches
	 */
	public static List<String> getHuifu(String s) {
		return collectGroup(XI2_SPAN, s, 1);
	}

	/**
	 * Collects the {@code <em>} content of every {@code num} cell and prints each
	 * one. Presumably these are the view counts ("dianji") - confirm against the
	 * page.
	 *
	 * @param s page markup to scan
	 * @return {@code <em>} contents in document order; empty if nothing matches
	 */
	public static List<String> getDianji(String s) {
		return collectGroup(NUM_CELL_EM, s, 1);
	}

	/**
	 * Appends the content of a GBK-encoded local file to {@code buf}, with line
	 * terminators removed. Failures are reported on the console, not thrown.
	 *
	 * @param filePath path of the file to read
	 * @param buf      buffer receiving the text
	 */
	public static void readTxtFile(String filePath, StringBuffer buf) {
		// isFile() is already false for a path that does not exist.
		if (filePath == null || !new File(filePath).isFile()) {
			System.out.println(MSG_FILE_NOT_FOUND);
			return;
		}
		try (FileInputStream in = new FileInputStream(filePath)) {
			appendLines(in, buf, false);
		} catch (IOException e) {
			System.out.println(MSG_READ_FAILED);
			e.printStackTrace();
		}
	}

	/**
	 * Downloads a GBK-encoded page and appends it to {@code buf}, with line
	 * terminators removed, echoing every line to the console. Failures are
	 * reported on the console, not thrown.
	 *
	 * @param html URL of the page to download
	 * @param buf  buffer receiving the text
	 */
	public static void readHtml(String html, StringBuffer buf) {
		try (InputStream in = new URL(html).openStream()) {
			// Each line is read exactly once; the previous loop called readLine()
			// twice per iteration and therefore dropped every other line.
			appendLines(in, buf, true);
		} catch (IOException e) {
			System.out.println("Failed to read " + html);
			e.printStackTrace();
		}
	}

	/** Decodes {@code in} as GBK and appends every line (without terminator) to {@code buf}. */
	private static void appendLines(InputStream in, StringBuffer buf, boolean echo)
			throws IOException {
		BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING));
		String line;
		while ((line = reader.readLine()) != null) {
			buf.append(line);
			if (echo) {
				System.out.println(line);
			}
		}
	}

	/** Returns the given capture group of every match, printing each value as it is found. */
	private static List<String> collectGroup(Pattern pattern, String input, int group) {
		List<String> found = new ArrayList<>();
		Matcher m = pattern.matcher(input);
		while (m.find()) {
			String value = m.group(group);
			found.add(value);
			System.out.println(value);
		}
		return found;
	}

	/** Returns {@code values.get(index)}, or an empty string when the list is too short. */
	private static String fieldAt(List<String> values, int index) {
		return index < values.size() ? values.get(index) : "";
	}
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值