java爬取百度贴吧所有用户头像

最新推荐文章于 2019-07-09 17:47:22 发布

习惯~

最新推荐文章于 2019-07-09 17:47:22 发布

阅读量537

点赞数

分类专栏：爬虫　文章标签：爬虫、贴吧、头像

本文链接：https://blog.csdn.net/yq714588944/article/details/85171657

版权

爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

第一次写爬虫，觉得蛮有意思的

写了个爬取贴吧用户头像的工具类

如有问题，多多指教~

爬虫工具类

package com.yq.spider;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawler utility that downloads Baidu Tieba user avatars.
 * <p>
 * Starting from one user home page it downloads every image URL found on the
 * page, collects the links to other user home pages, and keeps crawling those
 * pages in first-in-first-out order. The same approach can be adapted to
 * scrape other kinds of content.
 * <p>
 * All state is kept in static fields and is not synchronized, so the class is
 * intended for single-threaded use.
 *
 * @author 习惯~
 * Completed: 2018/12/21 19:53:00
 * Contact: qq 714588944
 */
public class SpiderUtil {
	/** Captures the value of every {@code src="..."} attribute (avatars are among them). */
	private static final String IMG_REGEX = "src=\"([\\w\\s/:.]+?)\"";
	/** Captures the user name part of every {@code /home/main?un=...} link. */
	private static final String HOME_REGEX = "/home/main\\?un=([\\s\\S&=?]+?)\"";
	/** Prefix used to rebuild an absolute user home page URL from a captured user name. */
	private static final String HOME_PREFIX = "http://tieba.baidu.com/home/main?un=";
	/** A captured image URL that is not longer than this prefix carries no portrait id. */
	private static final String PORTRAIT_PREFIX = "http://tb.himg.baidu.com/sys/portrait/item/";
	/** Directory the images are written to (the "spider" folder on drive D). */
	private static final String DOWNLOAD_DIR = "d:/spider";

	/** Every user home page URL seen so far; a set keeps the duplicate check cheap. */
	private static final Set<String> allURL = new HashSet<>();
	/** User home page URLs waiting to be crawled, processed first-in-first-out. */
	private static final Queue<String> all = new LinkedList<>();

	/**
	 * Crawler entry point.
	 * <p>
	 * Crawls {@code urlStr}, then every user home page discovered from it, until
	 * no unvisited page is left. The crawl is iterative, so its depth is not
	 * limited by the call stack, and it returns normally once the queue is empty.
	 *
	 * @param urlStr  URL of the first user home page; {@code null} means nothing to crawl
	 * @param charset name of the character set used to decode the pages
	 */
	public static void startSpider(String urlStr, String charset) {
		if (urlStr != null) {
			allURL.add(urlStr); // never queue the seed page again
		}
		String current = urlStr;
		while (current != null) {
			crawlPage(current, charset);
			current = all.poll(); // null when no page is left, which ends the crawl
			if (current != null) {
				System.out.println("正在爬取网页：" + current);
			}
		}
	}

	/**
	 * Downloads the images of one page and queues the user home pages it links to.
	 *
	 * @param pageURL URL of the page to process
	 * @param charset name of the character set used to decode the page
	 */
	private static void crawlPage(String pageURL, String charset) {
		String content = getURLContent(pageURL, charset);
		download(getMatcherContent(content, IMG_REGEX));

		List<String> users = getMatcherContent(content, HOME_REGEX);
		// Start at 1: per the original author the first match is the visitor's own profile.
		for (int i = 1; i < users.size(); i++) {
			String otherURL = HOME_PREFIX + users.get(i);
			// add() returns false for a page that was already seen, so it is queued only once.
			if (allURL.add(otherURL)) {
				all.add(otherURL);
			}
		}
	}

	/**
	 * Fetches a page and returns its source decoded with the given character set.
	 * <p>
	 * Lines are concatenated without line separators. I/O failures are printed
	 * and whatever was read up to that point (possibly the empty string) is returned.
	 *
	 * @param urlStr  URL of the page
	 * @param charset name of the character set used to decode the page
	 * @return the page source without line breaks, never {@code null}
	 */
	public static String getURLContent(String urlStr, String charset) {
		StringBuilder sb = new StringBuilder();
		// The raw stream is its own resource so it is closed even if the charset is rejected.
		try (InputStream in = new URL(urlStr).openStream();
				BufferedReader br = new BufferedReader(new InputStreamReader(in, charset))) {
			String line;
			while ((line = br.readLine()) != null) {
				sb.append(line);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return sb.toString();
	}

	/**
	 * Collects the first capture group of every match of {@code regex} in {@code content}.
	 *
	 * @param content text to search
	 * @param regex   regular expression with at least one capture group
	 * @return the captured strings in order of appearance; empty when nothing matches
	 */
	public static List<String> getMatcherContent(String content, String regex) {
		List<String> result = new ArrayList<>();
		Matcher m = Pattern.compile(regex).matcher(content);
		while (m.find()) {
			result.add(m.group(1));
		}
		return result;
	}

	/**
	 * Downloads every image in the list whose URL starts with {@code http}.
	 *
	 * @param result image URLs; entries that are {@code null} or lack the protocol are skipped
	 */
	public static void download(List<String> result) {
		for (String temp : result) {
			// Some captured values have no http protocol in front; those are skipped.
			if (temp == null || !temp.startsWith("http")) {
				continue;
			}
			download(temp);
		}
	}

	/**
	 * Downloads one image into the download directory.
	 * <p>
	 * The file name is the last path segment of the URL plus {@code .jpg}.
	 * I/O failures are printed and the method returns normally.
	 *
	 * @param imgURL URL of the image; {@code null} or a bare portrait prefix is ignored
	 */
	public static void download(String imgURL) {
		// Some captured values are just the portrait prefix without an id; skip them.
		if (imgURL == null || imgURL.length() <= PORTRAIT_PREFIX.length()) {
			return;
		}
		File dir = new File(DOWNLOAD_DIR);
		if (!dir.exists()) {
			dir.mkdirs();
		}
		String imgName = imgURL.substring(imgURL.lastIndexOf("/") + 1) + ".jpg";
		File dest = new File(dir, imgName);
		// The network stream is opened first so a failed request leaves no empty file behind.
		try (BufferedInputStream bis = new BufferedInputStream(new URL(imgURL).openStream());
				BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(dest))) {
			byte[] buffer = new byte[1024];
			int len;
			while ((len = bis.read(buffer)) != -1) {
				bos.write(buffer, 0, len);
			}
			bos.flush();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Closes the given resources, skipping {@code null} entries.
	 * <p>
	 * A failure to close one resource is printed and does not prevent the
	 * remaining resources from being closed.
	 *
	 * @param io resources to close
	 */
	public static void closeAll(Closeable... io) {
		if (io == null) {
			return;
		}
		for (Closeable temp : io) {
			try {
				if (temp != null) {
					temp.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
}

下面是测试类

package com.yq.spider;
/**
 * Manual driver for the crawler.
 * <p>
 * Supplies a seed user home page and the page character set, then hands
 * control to {@code SpiderUtil.startSpider}.
 *
 * @author 习惯~
 * Completed: 2018/12/21 19:53:00
 * Contact: qq 714588944
 */
public class Test {
	public static void main(String[] args) {
		// Character set the crawled pages are decoded with.
		final String charset = "utf8";
		// Seed: any Tieba user home page works; this one was picked at random
		// from the web by the original author (non-commercial use).
		final String url = "http://tieba.baidu.com/home/main?un=Spurs呆呆天枰";

		// The crawl keeps following newly discovered user pages, so expect to
		// stop the process manually.
		SpiderUtil.startSpider(url, charset);
	}
}

下面是我的测试结果（控制台输出当前爬取网页）

已经爬取一万多个用户头像

爬取其他内容跟这个原理类似

习惯~

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
java爬取百度贴吧所有用户头像

第一次写爬虫，觉得蛮有意思的写了个爬取贴吧用户头像的工具类如有问题，多多指教~爬虫工具类package com.yq.spider;import java.io.BufferedInputStream;import java.io.BufferedOutputStream;import java.io.BufferedReader;import java.io.Clos...
复制链接

扫一扫

专栏目录