第一只爬虫

一直对爬虫很感兴趣,四个月前就想写一个能够批量获取单词发音的爬虫。前两天学校里有一堂爬虫培训课,听完课后就写了这么一个小小的爬虫。

功能:将指定一批单词的发音下载到本地文件系统中。

下载到指定文件夹中:

程序如下: 


import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 第一只爬虫
 * 爬取指定单词的发音
 * @author imyLove7
 *
 */
public class Demo2 {
	public static void main(String[] args) {
		try {
			//正则表达式字符串
			String str;
			String line;
			//读取文件的当前行
			String currentLine = "";
			//服务器数据源
			String source;
			//目标链接头部
			String urlHead;
			InputStream is;
			//目标文件链接
			String targetUrl;
			//目标文件名称
			String targetName;
			//用于区分英音、美音
			boolean flag = true;
			StringBuilder builder;
			
			urlHead= "";
			BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("E:\\word.txt"),"UTF-8"));
			currentLine = reader.readLine();
			while((currentLine = reader.readLine()) != null) {
				//拼接目标URL
				targetUrl = urlHead + currentLine;
				System.out.println("当前:" + currentLine);
				//创建一个URL对象
				URL url = new URL(targetUrl);
				//打开连接
				HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
				//建立与远端服务器的连接
				urlConnection.connect();
				System.out.println("连接成功!");
				//获取服务器的数据流
				is = urlConnection.getInputStream();
				//连接服务器得到的数据是通过字节流获取的,需要将字节流转换成字符流才能读清服务器的数据
				//字符缓冲流的创建依赖于字符流,字符流的创建依赖于字节流
				//将字节流转换成字符缓冲流
				BufferedReader reader2 = new BufferedReader(new InputStreamReader(is));
				//创建缓冲字符串,用于拼接读取到的所有数据
				builder = new StringBuilder();
				line = null;
				while((line = reader2.readLine()) != null) {
					builder.append(line);
				}
				reader2.close();
				source = builder.toString();
				//构造正则表达式
				str = "";
				Pattern pattern = Pattern.compile(str);
				Matcher matcher = pattern.matcher(source);
				while(matcher.find()) {
					//获取对应的数据
					String mp3 = matcher.group();
					//构造目标文件名
					if(flag) {
						System.out.println("US");
						flag = false;
						targetName = currentLine + "_" + "ame";
					}
					else {
						System.out.println("UK");
						flag = true;
						targetName = currentLine + "_" + "bre";
					}
					//截取mp3目标链接
					String mp3Url = mp3.substring(mp3.indexOf(""), mp3.indexOf(""));
					System.out.println(mp3Url);
					DownloadUtils downloadUtils = new DownloadUtils(mp3Url, targetName, "mp3", "E:\\word\\22");
					try {
						downloadUtils.httpDownload();
					} catch (Exception e) {
						e.printStackTrace();
					}
				}
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}	finally {
			System.out.println("任务执行完毕!");
		}
	}
}
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Download helper: fetches one resource over HTTP and stores it as
 * {@code <targetName>.<targetType>} inside a directory.
 *
 * @author imyLove7
 *
 */
public class DownloadUtils {

	/** URL of the resource to download. */
	private String downloadPath = null;
	/** File name (without extension) the download is stored under. */
	private String targetName = null;
	/** File extension (without the dot) of the stored file. */
	private String targetType = null;
	/** Existing directory the file is written to. */
	private File savedPath = null;

	/** Creates an unconfigured instance; populate it through the setters. */
	public DownloadUtils() {
		super();
	}

	/**
	 * @param downloadPath URL of the resource to download
	 * @param targetName   file name without extension
	 * @param targetType   file extension without the dot
	 * @param savedPath    path of the directory to store the file in
	 */
	public DownloadUtils(String downloadPath, String targetName, String targetType, String savedPath) {
		super();
		this.downloadPath = downloadPath;
		this.targetName = targetName;
		this.targetType = targetType;
		this.savedPath = new File(savedPath);
	}

	/**
	 * @param downloadPath URL of the resource to download
	 * @param targetName   file name without extension
	 * @param targetType   file extension without the dot
	 * @param savedPath    directory to store the file in
	 */
	public DownloadUtils(String downloadPath, String targetName, String targetType, File savedPath) {
		super();
		this.downloadPath = downloadPath;
		this.targetName = targetName;
		this.targetType = targetType;
		this.savedPath = savedPath;
	}

	public String getDownloadPath() {
		return downloadPath;
	}

	public void setDownloadPath(String downloadPath) {
		this.downloadPath = downloadPath;
	}

	public String getTargetName() {
		return targetName;
	}

	public void setTargetName(String targetName) {
		this.targetName = targetName;
	}

	public String getTargetType() {
		return targetType;
	}

	public void setTargetType(String targetType) {
		this.targetType = targetType;
	}

	public File getSavedPath() {
		return savedPath;
	}

	public void setSavedPath(File savedPath) {
		this.savedPath = savedPath;
	}

	/**
	 * Downloads {@code downloadPath} into {@code savedPath/targetName.targetType}.
	 *
	 * <p>An existing target file is overwritten, so downloading the same resource again
	 * yields the same file instead of appending a second copy to it. Both streams are closed
	 * even when the transfer fails.
	 *
	 * @throws Exception if the download path is {@code null}, if the save directory is
	 *                   {@code null}, missing or not a directory, if the URL is not an HTTP
	 *                   URL, or if the transfer fails
	 */
	public void httpDownload() throws Exception {
		// Validate the configuration before touching the network.
		if (downloadPath == null) {
			throw new Exception("非法!下载路径为空!");
		}
		if (savedPath == null || !savedPath.exists() || !savedPath.isDirectory()) {
			throw new Exception("非法!保存路径为空或者不存在!");
		}

		HttpURLConnection urlConnection = (HttpURLConnection) new URL(downloadPath).openConnection();
		urlConnection.connect();
		File file = new File(savedPath, targetName + "." + targetType);
		// The response stream is opened first: when the request fails, no file is created.
		try (InputStream inputStream = urlConnection.getInputStream();
				FileOutputStream fileOutputStream = new FileOutputStream(file)) {
			// Copy in chunks; writing and flushing byte by byte costs a system call per byte.
			byte[] buffer = new byte[8192];
			int count;
			while ((count = inputStream.read(buffer)) != -1) {
				fileOutputStream.write(buffer, 0, count);
			}
		}
	}
}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值