最近想搞一个英语词典。市面上的词典,要么广告太多且收费,要么不能成套、成体系地提供各阶段的单词。于是写了个爬虫,爬取了市面上的一些单词。本文主要记录单词音标(发音mp3)的爬取过程,以便后续查询。(ps:大家在写爬虫时尽量使用python,总感觉java写的爬虫效率比较低,且非常消耗内存,怪怪的。)
1.组装url,接口地址+单词
// Youdao pronunciation API. type=0: American accent, type=1: British accent.
String baseUrl = "http://dict.youdao.com/dictvoice?type=1&audio=";
// Words (or phrases) whose pronunciation audio should be fetched.
List<String> wordList = lianCiZhenti5500Mapper.findVcVocabulary();
for (String originalWord : wordList) {
    try {
        // Percent-encode the whole word, not just spaces: characters such as
        // '&', '#', '+' or non-ASCII text would otherwise corrupt the query string.
        // URLEncoder writes a space as '+', so turn it back into "%20" to keep the
        // same form the request used before. A literal '+' in the word has already
        // become "%2B", so this replacement cannot touch it.
        // Encoding happens inside the try so that one bad entry (e.g. null) is
        // reported as a failed download instead of aborting the whole batch.
        String encodedWord = java.net.URLEncoder.encode(originalWord, "UTF-8").replace("+", "%20");
        // Request URL = API endpoint + encoded word.
        String wordUrl = baseUrl + encodedWord;
        // The unencoded word is used as the file name: <word>.mp3 under the target folder.
        DownloadUtils downloadUtils = new DownloadUtils(wordUrl, originalWord, "mp3", "H:\\wordMp3\\words");
        downloadUtils.httpDownload();
        System.out.print("\t \t \t下载成功");
    } catch (Exception e) {
        // Best effort: report the failure and continue with the next word.
        System.out.print("\t \t \t下载失败");
        e.printStackTrace();
    }
}
2. 使用http下载
/**
 * Downloads one resource over HTTP (here: a word's pronunciation mp3) and stores it as
 * {@code <wordString>.<targetType>} inside a target directory.
 *
 * <p>Instances are immutable; create one instance per download.
 */
public class DownloadUtils {
    /** Maximum time to wait for the TCP connection to be established, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10_000;
    /** Maximum time to wait for data while reading the response, in milliseconds. */
    private static final int READ_TIMEOUT_MILLIS = 30_000;
    /** Size of the copy buffer, in bytes. */
    private static final int BUFFER_SIZE = 8 * 1024;

    /** Full URL of the resource to download. */
    private final String wordUrl;
    /** The word itself; used as the base name of the saved file. */
    private final String wordString;
    /** Extension of the saved file, without the leading dot (e.g. {@code "mp3"}). */
    private final String targetType;
    /** Directory the file is written into; created on demand. */
    private final File rootDir;

    /**
     * @param wordUrl    full URL of the resource to download
     * @param wordString word used as the base name of the saved file
     * @param targetType file extension without the leading dot
     * @param rootDir    directory the file is written into
     */
    public DownloadUtils(String wordUrl, String wordString, String targetType, File rootDir) {
        super();
        this.wordUrl = wordUrl;
        this.wordString = wordString;
        this.targetType = targetType;
        this.rootDir = rootDir;
    }

    /**
     * Same as {@link #DownloadUtils(String, String, String, File)}, with the target
     * directory given as a path string.
     *
     * @throws NullPointerException if {@code rootDir} is {@code null}
     */
    public DownloadUtils(String wordUrl, String wordString, String targetType, String rootDir) {
        this(wordUrl, wordString, targetType, new File(rootDir));
    }

    /**
     * Creates an instance with no URL and no target directory. Since the class has no
     * setters, {@link #httpDownload()} on such an instance always fails validation.
     */
    public DownloadUtils() {
        this(null, null, null, (File) null);
    }

    /**
     * Downloads the resource and writes it to {@code <rootDir>/<wordString>.<targetType>}.
     * The target directory is created if missing, and an existing file of the same name
     * is overwritten, so repeating a download never appends a second copy.
     *
     * @throws Exception if the URL or target directory is not set, the request fails,
     *                   the directory cannot be created, or the file cannot be written
     */
    public void httpDownload() throws Exception {
        validate();
        HttpURLConnection urlConnection = (HttpURLConnection) new URL(wordUrl).openConnection();
        // Without timeouts a single stalled request would block a batch run forever.
        urlConnection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        urlConnection.setReadTimeout(READ_TIMEOUT_MILLIS);
        urlConnection.connect();
        // Open the response first: if the request fails, no file is created.
        // Closing the stream releases the request; disconnect() is deliberately not
        // called so the underlying connection can be reused for the next word.
        try (InputStream inputStream = urlConnection.getInputStream()) {
            // mkdirs() returns false both on failure and when the directory already
            // exists, hence the extra isDirectory() check.
            if (!rootDir.mkdirs() && !rootDir.isDirectory()) {
                throw new java.io.IOException("无法创建目标文件夹: " + rootDir);
            }
            File targetFile = new File(rootDir, wordString + "." + targetType);
            // Truncate instead of append: appending to an earlier download corrupts the file.
            try (FileOutputStream fileOutputStream = new FileOutputStream(targetFile)) {
                byte[] buffer = new byte[BUFFER_SIZE];
                int bytesRead;
                while ((bytesRead = inputStream.read(buffer)) != -1) {
                    fileOutputStream.write(buffer, 0, bytesRead);
                }
            }
        }
    }

    /**
     * Checks that the download URL and the target directory have been provided.
     *
     * @throws Exception if the URL is null or empty, or the target directory is null
     */
    private void validate() throws Exception {
        if (wordUrl == null || wordUrl.isEmpty()) {
            throw new Exception("下载路径不能为空!");
        }
        if (rootDir == null) {
            throw new Exception("目标文件夹不存在!");
        }
    }
}
3.附上url编码表