Built on the WebMagic crawler framework (https://github.com/code4craft/webmagic), this program crawls GitHub users' avatars and saves them to local disk.
It uses Apache HttpClient to send the HTTP requests, Jsoup to filter the downloaded HTML with a CSS selector and find the right image links, and then sends further requests to download each image and save it locally.
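Before the full sources, here is a minimal standalone sketch of that selection step outside WebMagic (my own illustration, assuming the follower pages still serve the img.gravatar markup this crawler targets):

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorSketch {
    public static void main(String[] args) throws Exception {
        // Fetch one follower page (the same seed URL used in main() below) and
        // print each user name with the raw avatar link found by the CSS selector.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(
                     new HttpGet("https://github.com/code4craft/webmagic/followers"))) {
            Document doc = Jsoup.parse(EntityUtils.toString(response.getEntity()));
            for (Element img : doc.select("img.gravatar")) { // same CSS selector MyPage uses below
                System.out.println(img.attr("alt") + " -> " + img.attr("src"));
            }
        }
    }
}

The crawler proper wires this same idea into WebMagic's PageProcessor, as shown next.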
package webcrawler.webcrawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class GithubAvaterDownLoaderProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
    Set<String> globalSet = new HashSet<String>();
    int index = 1;

    public void process(Page page) {
        /* When the Spider is instantiated in main(), this GithubAvaterDownLoaderProcessor
           instance ends up as the PageProcessor held by the Spider. By the time process()
           is eventually invoked, the Page instance has already been fetched. */
        List<String> listWithFollowers = new ArrayList<String>();
        for (String ori : page.getHtml().links().regex("(https://github\\.com/\\w+)").all()) {
            listWithFollowers.add(ori + "/followers");
            listWithFollowers.add(ori + "/following");
        }

        page.addTargetRequests(listWithFollowers);

        MyPage myPage = new MyPage(page);
        page.putField("nameLinkMap", myPage.getMap());

        this.downloadSavePicToLocal(page); // send the download requests and save to disk
        // Record every visited user name in a set; users already in the set can be
        // skipped later without sending another request.
        globalSet.addAll(myPage.getMap().keySet());
    }

    public void downloadSavePicToLocal(Page page) {
        HttpClient client = HttpClientBuilder.create().build();
        Map<String, String> map = page.getResultItems().get("nameLinkMap");

        Iterator<String> mapItor = map.keySet().iterator();
        while (mapItor.hasNext()) { // iterate the map: user name (key) -> avatar download link (value)
            String name = mapItor.next();
            if (globalSet.contains(name)) // check whether this user was already visited
                continue;

            String link = map.get(name);
            HttpGet getRequest = new HttpGet(link);
            try {
                HttpResponse response = client.execute(getRequest);
                HttpEntity entity = response.getEntity();
                InputStream is = entity.getContent();
                File avaterFolder = new File(".\\AvatersFolder");
                if (!avaterFolder.exists()) avaterFolder.mkdirs();

                File file = new File(avaterFolder + File.separator + index++ + " " + name + ".jpg");

                FileOutputStream fileOutputStream = new FileOutputStream(file);
                byte[] bytes = new byte[1024];
                int length;
                while ((length = is.read(bytes, 0, bytes.length)) != -1) {
                    fileOutputStream.write(bytes, 0, length);
                }
                fileOutputStream.flush();
                is.close();
                fileOutputStream.close();
            } catch (Throwable e) {
                e.printStackTrace();
            }
        }
    }

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new GithubAvaterDownLoaderProcessor())
              .addPipeline(new MyConsolePipeline())
              .addUrl("https://github.com/code4craft/webmagic/followers") // seed URL
              .thread(2)
              .run();
    }
}
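One caveat with downloadSavePicToLocal above: the streams are closed manually, so an exception thrown between opening and closing them leaks resources. A sketch of the same download step rewritten with try-with-resources and java.nio (my own variant, not part of the original code; the avatar URL in main is made up for illustration):

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

public class AvatarDownload {
    // Downloads one avatar to the given file, closing every resource even on failure.
    static void download(CloseableHttpClient client, String link, Path target) throws Exception {
        try (CloseableHttpResponse response = client.execute(new HttpGet(link));
             InputStream is = response.getEntity().getContent()) {
            if (target.getParent() != null)
                Files.createDirectories(target.getParent());
            Files.copy(is, target, StandardCopyOption.REPLACE_EXISTING); // stream body straight to disk
        }
    }

    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            // Hypothetical link; in the crawler it would come from the nameLinkMap built by MyPage.
            download(client, "https://avatars0.githubusercontent.com/u/1",
                     Paths.get("AvatersFolder", "demo.jpg"));
        }
    }
}

Next, the MyPage helper that builds the name-to-link map: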
package webcrawler.webcrawler;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import us.codecraft.webmagic.Page;

public class MyPage {
    private String rString;
    private Map<String, String> map;

    public Map<String, String> getMap() {
        return this.map; // user name (key) -> avatar link (value)
    }

    public MyPage(Page page) {
        rString = page.getRawText();
        map = new HashMap<String, String>();
        Document document = Jsoup.parse(rString); // parse the page with Jsoup
        List<Element> listOfElements = document.select("img.gravatar"); // CSS selector to locate the avatar elements
        for (Element element : listOfElements) {
            // Key: the user name (the element's alt attribute); value: the cleaned avatar link.
            // Skip src values that don't match, so no null links reach the downloader.
            String cleaned = getCleanOne(element.attr("src"));
            if (cleaned != null)
                map.put(element.attr("alt"), cleaned);
        }
    }

    public String getCleanOne(String s) {
        Pattern pattern = Pattern.compile("https://avatars\\d\\.githubusercontent\\.com/u/\\d+"); // keep only links of this shape
        Matcher matcher = pattern.matcher(s);
        if (matcher.find())
            return matcher.group();
        return null;
    }
}
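For reference, getCleanOne exists because the src attribute on the page typically carries a query string (size and version parameters) that should not end up in the saved link; the regex keeps only the bare avatar URL. A tiny standalone demo (the sample src value is hypothetical):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CleanOneDemo {
    public static void main(String[] args) {
        // Hypothetical src value as it might appear in a follower page's <img> tag
        String src = "https://avatars3.githubusercontent.com/u/123456?v=3&s=96";
        Matcher m = Pattern.compile("https://avatars\\d\\.githubusercontent\\.com/u/\\d+").matcher(src);
        // Prints the URL with the query string stripped: https://avatars3.githubusercontent.com/u/123456
        System.out.println(m.find() ? m.group() : "no match");
    }
}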