分为两种形式的抓取:
1.基于首字母的四轮次抓取 如: a ,aa ,aaa,aaaa
2.基于汉语i拼音的三轮次抓取:附拼音表
链接:
http://pan.baidu.com/s/1eS5Kdmq 密码:n9pb
使用的框架为webmagic
淘宝爬取:
public class TaobaoPinyinSuggestWordPageProcessor implements PageProcessor{
// 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等
private Site site = Site.
me().
setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss").
setRetryTimes(5).setSleepTime(10);
@Override
// process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
public void process(Page page) {
//["童装","40827952"]
JSONArray a = JSON.parseArray(JSON.parseObject(page.getJson().toString()).get("result").toString());
for (Object aa : a) {
String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", "");
String substring = replace.substring(0,replace.indexOf(","));
try {
IOUtils.write(substring.getBytes(), new FileOutputStream(new File(fileName),true));
IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true));
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
if(a.size()>9){
String url = page.getUrl().toString();
//https://suggest.taobao.com/sug?code=utf-8&q=
int index = "https://suggest.taobao.com/sug?code=utf-8&q=".length();
String query = url.substring(index, url.length());
char[] chars = query.toCharArray();
int num = 0;
for (char c : chars) {
if(c>=65 && c <= 90){
num++;
}
}
if(num <3){
List<String> temp = new ArrayList<String>();
for (String add : speeeds) {
temp.add(url+add);
}
page.addTargetRequests(temp);
}
}
}
@Override
public Site getSite() {
return site;
}
private static String fileName ;
private static List<String> speeeds;
public static void main(String[] args) throws IOException {
if(args.length>1 && args.length<3){
fileName = args[0];
speeeds = FileUtils.readLines(new File(args[1]));
}else{
fileName = &#