在Google这个由10的100次方得名的站点中,各种评估网站的算法层出不穷,而PageRank即是其中之一。
Google的PageRank根据网站的外部链接和内部链接的数量和质量来衡量网站的价值。PageRank背后的概念是,每个到页面的链接都是对该页面的一次投票,被链接的越多,就意味着被其他网站投票越多。这个就是所谓的“链接流行度”——衡量多少人愿意将他们的网站和你的网站挂钩。PageRank这个概念引自学术中一篇论文的被引述的频度——即被别人引述的次数越多,一般判断这篇论文的权威性就越高。
通常情况下讲,原创内容越多的站点,PageRank越容易提升,反之则相对比较困难,PageRank最大上限值为10。在Google的评估中,能上10的网站真可谓凤毛麟角,即使算上Google,能成就PageRank 10这“伟业”者,放眼全球也不足40家。一般来说,个人站点评估值达到4就算办得不错,商业网站到6以上便算步入正轨了。
网上虽然有不少现成的查询器及源码,但是光用别人的毕竟不符合程序员风格,所以今天自己用Java重造轮子又写了个PageRank查询实现,捎带着把一些常用搜索引擎的网站链接及反向链接查询也加上了。
源码如下:
GooglePageRank.java
- packageorg.loon.test;
- importjava.io.IOException;
- importjava.util.Random;
- importjava.util.regex.Matcher;
- importjava.util.regex.Pattern;
- /**
- *Copyright2008
- *
- *LicensedundertheApacheLicense,Version2.0(the"License");youmaynot
- *usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof
- *theLicenseat
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- *Unlessrequiredbyapplicablelaworagreedtoinwriting,software
- *distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT
- *WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe
- *Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder
- *theLicense.
- *
- *@projectloonframework
- *@authorchenpeng
- *@email:ceponline@yahoo.com.cn
- *@version0.1
- */
- publicclassGooglePageRank{
- //googlepagerank服务器ip地址列表(最近google小气了很多,反复查询一个封ip)
- finalstaticString[]GoogleServiceIP=newString[]{"64.233.161.100",
- "64.233.161.101","64.233.183.91","64.233.189.44","66.102.1.103",
- "66.102.9.115","66.249.89.83","66.249.91.99","66.249.93.190"};
- //google用识别标记
- finalstaticprivateintGOOGLE_MAGIC=0xE6359A60;
- //ch数值混合器
- privateclassCHMix{
- inta;
- intb;
- intc;
- publicCHMix(){
- this(0,0,0);
- }
- publicCHMix(inta,intb,intc){
- this.a=a;
- this.b=b;
- this.c=c;
- }
- }
- /**
- *按google要求混合成ch数据
- *
- *@parammix
- */
- privatestaticvoidmix(finalCHMixmix){
- mix.a-=mix.b;
- mix.a-=mix.c;
- mix.a^=mix.c>>13;
- mix.b-=mix.c;
- mix.b-=mix.a;
- mix.b^=mix.a<<8;
- mix.c-=mix.a;
- mix.c-=mix.b;
- mix.c^=mix.b>>13;
- mix.a-=mix.b;
- mix.a-=mix.c;
- mix.a^=mix.c>>12;
- mix.b-=mix.c;
- mix.b-=mix.a;
- mix.b^=mix.a<<16;
- mix.c-=mix.a;
- mix.c-=mix.b;
- mix.c^=mix.b>>5;
- mix.a-=mix.b;
- mix.a-=mix.c;
- mix.a^=mix.c>>3;
- mix.b-=mix.c;
- mix.b-=mix.a;
- mix.b^=mix.a<<10;
- mix.c-=mix.a;
- mix.c-=mix.b;
- mix.c^=mix.b>>15;
- }
- /**
- *获得ch数值混合器
- *
- *@return
- */
- publicstaticCHMixgetInnerCHMix(){
- returnnewGooglePageRank().newCHMix();
- }
- /**
- *通过url获得googlech(google数据库针对页面的全球唯一标识)
- *
- *@paramurl
- *@return
- */
- publicstaticStringGoogleCH(finalStringurl){
- //格式化为google要求的info:url模式
- StringnUrl=String.format("info:%s",newObject[]{url});
- //获得新url字符串格式
- char[]urls=nUrl.toCharArray();
- //获得新url长度
- intlength=urls.length;
- //获得一个ch数值混合器
- CHMixchMix=GooglePageRank.getInnerCHMix();
- //为c注入google识别标识
- chMix.c=GOOGLE_MAGIC;
- //为a、b项注入google要求的初始标识
- chMix.a=chMix.b=0x9E3779B9;
- intk=0;
- intlen=length;
- while(len>=12){
- chMix.a+=(int)(urls[k+0]+(urls[k+1]<<8)
- +(urls[k+2]<<16)+(urls[k+3]<<24));
- chMix.b+=(int)(urls[k+4]+(urls[k+5]<<8)
- +(urls[k+6]<<16)+(urls[k+7]<<24));
- chMix.c+=(int)(urls[k+8]+(urls[k+9]<<8)
- +(urls[k+10]<<16)+(urls[k+11]<<24));
- //获得混合运算后的数据
- GooglePageRank.mix(chMix);
- k+=12;
- len-=12;
- }
- chMix.c+=length;
- //产生googlech的11位标识
- switch(len){
- case11:
- chMix.c+=(int)(urls[k+10]<<24);
- case10:
- chMix.c+=(int)(urls[k+9]<<16);
- case9:
- chMix.c+=(int)(urls[k+8]<<8);
- case8:
- chMix.b+=(int)(urls[k+7]<<24);
- case7:
- chMix.b+=(int)(urls[k+6]<<16);
- case6:
- chMix.b+=(int)(urls[k+5]<<8);
- case5:
- chMix.b+=(int)(urls[k+4]);
- case4:
- chMix.a+=(int)(urls[k+3]<<24);
- case3:
- chMix.a+=(int)(urls[k+2]<<16);
- case2:
- chMix.a+=(int)(urls[k+1]<<8);
- case1:
- chMix.a+=(int)(urls[k+0]);
- break;
- default:
- break;
- }
- //获得混合运算后的数据
- GooglePageRank.mix(chMix);
- //获得未修订的CH
- Stringtch=String.valueOf(chMix.c);
- //矫正差值后反馈正确CH
- returnString
- .format("6%s",newObject[]{tch.length()<10?("-"+tch)
- .intern():tch});
- }
- /**
- *正则匹配pagerank结果
- *
- *@paramvalue
- *@return
- */
- privatestaticStringMatchRank(finalStringvalue){
- Patternpattern=Pattern.compile("Rank_1:[0-9]:([0-9]+)");
- Matchermatcher=pattern.matcher(value);
- if(matcher.find()){
- returnmatcher.group(1);
- }
- return"0";
- }
- /**
- *获得指定页面的googlepagerank值
- *
- *@paramurl
- *@return
- */
- publicstaticStringGooglePR(finalStringurl){
- Stringrip=GoogleServiceIP[newRandom()
- .nextInt(GoogleServiceIP.length)];
- returnGooglePR(url,rip);
- }
- /**
- *以指定的google服务器获得指定页面的googlepagerank值
- *
- *@paramurl
- *@paramip
- *@return
- */
- publicstaticStringGooglePR(finalStringurl,finalStringip){
- //产生查询用唯一标识
- Stringchecksum=GoogleCH(url);
- //产生查询用url
- StringqueryUrl=String
- .format(
- "http://%s/search?client=navclient-auto&ch=%s&features=Rank&q=info:%s",
- newObject[]{ip,checksum,url});
- Stringresponse;
- try{
- response=SimpleWebClient.getRequestHttp(queryUrl);
- }catch(IOExceptione){
- response="";
- }
- if(response.length()==0){
- return"0";
- }else{
- returnGooglePageRank.MatchRank(response);
- }
- }
- }
SimpleWebClient.java
- packageorg.loon.test;
- importjava.io.BufferedInputStream;
- importjava.io.ByteArrayOutputStream;
- importjava.io.IOException;
- importjava.io.InputStream;
- importjava.io.InputStreamReader;
- importjava.io.OutputStreamWriter;
- importjava.net.HttpURLConnection;
- importjava.net.URL;
- importjava.util.HashMap;
- importjava.util.Iterator;
- importjava.util.Map;
- importjava.util.Set;
- importjava.util.Map.Entry;
- importsun.misc.BASE64Encoder;
- /**
- *Copyright2008
- *
- *LicensedundertheApacheLicense,Version2.0(the"License");youmaynot
- *usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof
- *theLicenseat
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- *Unlessrequiredbyapplicablelaworagreedtoinwriting,software
- *distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT
- *WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe
- *Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder
- *theLicense.
- *
- *@projectloonframework
- *@authorchenpeng
- *@email:ceponline@yahoo.com.cn
- *@version0.1
- */
/**
 * Minimal blocking HTTP client built on {@link HttpURLConnection}.
 *
 * <p>Sends one request, follows redirects itself, and returns the response
 * body as a string - for error statuses (4xx/5xx) the error body is returned
 * instead of an exception being thrown.
 */
public class SimpleWebClient {

    /** Redirect hops followed before giving up, so a redirect cycle ends. */
    private static final int MAX_REDIRECTS = 10;

    /** Read-buffer size used when the server declares no usable length. */
    private static final int DEFAULT_BUFFER_SIZE = 4096;

    /** Upper bound for the read buffer, whatever Content-Length claims. */
    private static final int MAX_BUFFER_SIZE = 64 * 1024;

    /** 307 has no named constant in HttpURLConnection. */
    private static final int HTTP_TEMPORARY_REDIRECT = 307;

    /**
     * Fetches {@code urlString} with GET and decodes the body as UTF-8.
     *
     * @param urlString address to fetch; {@code http://} is assumed when no
     *                  scheme is given
     * @return the response body
     * @throws IOException if the request cannot be completed
     */
    public static String getRequestHttp(String urlString) throws IOException {
        return getRequestHttp(urlString, "utf-8");
    }

    /**
     * Fetches {@code urlString} with GET, a 5 second timeout and the given
     * response encoding.
     *
     * @param urlString address to fetch; {@code http://} is assumed when no
     *                  scheme is given
     * @param encoding  charset name used to decode the body, or {@code null}
     *                  for the platform default charset
     * @return the response body
     * @throws IOException if the request cannot be completed
     */
    public static String getRequestHttp(String urlString, String encoding)
            throws IOException {
        return getRequestHttp(urlString, encoding, null, 5000);
    }

    /**
     * Sends a request and returns the response body.
     *
     * <p>{@code parameter} may carry four reserved keys - {@code user} and
     * {@code pass} (HTTP Basic credentials, used only when both are present),
     * {@code method} (request method, default {@code GET}) and {@code post}
     * (request body). Every other entry is sent as a request header. Keys and
     * values must be strings.
     *
     * @param urlString address to fetch; {@code http://} is assumed when no
     *                  scheme is given
     * @param encoding  charset name used to decode the body, or {@code null}
     *                  for the platform default charset
     * @param parameter request options and extra headers, may be {@code null}
     * @param timeout   connect and read timeout in milliseconds; values
     *                  {@code <= 0} leave the connection defaults in place
     * @return the response body, empty if the server sent none
     * @throws IOException if the request fails, a redirect is malformed, or
     *                     more than {@value #MAX_REDIRECTS} redirects occur
     */
    public static String getRequestHttp(final String urlString,
            final String encoding, final Map<?, ?> parameter, final int timeout)
            throws IOException {
        String user = null;
        String password = null;
        String method = "GET";
        String post = null;
        final Map<String, String> headers = new HashMap<String, String>();
        if (parameter != null) {
            for (Map.Entry<?, ?> entry : parameter.entrySet()) {
                final String key = (String) entry.getKey();
                final String value = (String) entry.getValue();
                if ("user".equals(key)) {
                    user = value;
                } else if ("pass".equals(key)) {
                    password = value;
                } else if ("method".equals(key)) {
                    method = value;
                } else if ("post".equals(key)) {
                    post = value;
                } else {
                    headers.put(key, value);
                }
            }
        }

        String authorization = null;
        if (user != null && password != null) {
            // java.util.Base64 never inserts line breaks (the old sun.misc
            // encoder wrapped long values, corrupting the header) and the
            // scheme name must be separated from the token by a space.
            // Credentials are encoded with the platform charset, as before.
            authorization = "Basic "
                    + java.util.Base64.getEncoder().encodeToString(
                            (user + ":" + password).getBytes());
        }

        URL url = new URL(withScheme(urlString));
        for (int hop = 0; hop <= MAX_REDIRECTS; hop++) {
            final HttpURLConnection connection = (HttpURLConnection) url
                    .openConnection();
            try {
                if (authorization != null) {
                    connection.setRequestProperty("Authorization",
                            authorization);
                }
                connection.setDoInput(true);
                // Only announce a body when there is one to send.
                connection.setDoOutput(post != null);
                connection.setUseCaches(false);
                // Redirects are handled by this loop, not by the connection.
                connection.setInstanceFollowRedirects(false);
                connection.setRequestMethod(method);
                if (timeout > 0) {
                    connection.setConnectTimeout(timeout);
                    // Without a read timeout a stalled server blocks forever.
                    connection.setReadTimeout(timeout);
                }
                // Browser-like defaults; caller-supplied headers override.
                connection.setRequestProperty("User-Agent",
                        "Mozilla/4.0 (compatible; MSIE 7.0;)");
                connection.setRequestProperty("Accept",
                        "image/gif,image/x-xbitmap,image/jpeg,image/pjpeg,application/x-shockwave-flash,application/msword,application/vnd.ms-excel,application/vnd.ms-powerpoint,*/*");
                for (Map.Entry<String, String> header : headers.entrySet()) {
                    connection.setRequestProperty(header.getKey(),
                            header.getValue());
                }
                if (post != null) {
                    // Body is written with the platform charset, as before.
                    try (OutputStreamWriter body = new OutputStreamWriter(
                            connection.getOutputStream())) {
                        body.write(post);
                    }
                }

                final int status = connection.getResponseCode();
                if (isRedirect(status)) {
                    final String location = connection
                            .getHeaderField("Location");
                    if (location == null) {
                        throw new IOException("Redirect (" + status
                                + ") from " + url
                                + " without a Location header");
                    }
                    // Resolve against the current URL: Location may be
                    // relative.
                    final URL next = new URL(url, location);
                    final String protocol = next.getProtocol();
                    if (!"http".equals(protocol)
                            && !"https".equals(protocol)) {
                        throw new IOException(
                                "Redirect to unsupported protocol: " + next);
                    }
                    if (status == HttpURLConnection.HTTP_SEE_OTHER) {
                        // 303 asks for the target to be retrieved with GET.
                        method = "GET";
                        post = null;
                    }
                    url = next;
                    continue;
                }
                return readBody(connection, status, encoding);
            } finally {
                connection.disconnect();
            }
        }
        throw new IOException("More than " + MAX_REDIRECTS
                + " redirects while fetching " + urlString);
    }

    /**
     * Prefixes a scheme-less address with {@code http://}; protocol-relative
     * addresses ({@code //host/path}) only get {@code http:}.
     */
    private static String withScheme(final String urlString) {
        if (urlString.startsWith("http://")
                || urlString.startsWith("https://")) {
            return urlString;
        }
        if (urlString.startsWith("//")) {
            return "http:" + urlString;
        }
        return "http://" + urlString;
    }

    /** Tells whether {@code status} is a redirect this client follows. */
    private static boolean isRedirect(final int status) {
        return status == HttpURLConnection.HTTP_MOVED_PERM
                || status == HttpURLConnection.HTTP_MOVED_TEMP
                || status == HttpURLConnection.HTTP_SEE_OTHER
                || status == HTTP_TEMPORARY_REDIRECT;
    }

    /**
     * Reads the whole response body of an answered request.
     *
     * <p>4xx/5xx responses are read from the error stream; everything else
     * from the input stream. A missing stream yields an empty string.
     */
    private static String readBody(final HttpURLConnection connection,
            final int status, final String encoding) throws IOException {
        final InputStream raw = status >= HttpURLConnection.HTTP_BAD_REQUEST
                ? connection.getErrorStream()
                : connection.getInputStream();
        if (raw == null) {
            return "";
        }
        // Content-Length is only a sizing hint: 0 or -1 fall back to the
        // default (a zero-length buffer would make read() return 0 forever)
        // and huge values are capped.
        final int declared = connection.getContentLength();
        final int size = declared <= 0 ? DEFAULT_BUFFER_SIZE : Math.min(
                declared, MAX_BUFFER_SIZE);
        try (InputStream in = new BufferedInputStream(raw)) {
            if (encoding != null) {
                return SimpleWebClient.read(in, size, encoding);
            }
            final ByteArrayOutputStream out = new ByteArrayOutputStream();
            final byte[] bytes = new byte[size];
            int read;
            while ((read = in.read(bytes)) >= 0) {
                out.write(bytes, 0, read);
            }
            // No encoding requested: decode with the platform default.
            return new String(out.toByteArray());
        }
    }

    /**
     * Drains {@code in} into a string. The caller keeps ownership of the
     * stream and is responsible for closing it.
     *
     * @param in       stream to read until end of input
     * @param size     buffer size in characters; values below 1 are raised
     *                 to 1
     * @param encoding charset name used for decoding
     * @return the decoded text
     * @throws IOException if reading fails or the charset is unsupported
     */
    private static String read(final InputStream in, final int size,
            final String encoding) throws IOException {
        final StringBuilder text = new StringBuilder();
        final char[] buffer = new char[Math.max(size, 1)];
        final InputStreamReader reader = new InputStreamReader(in, encoding);
        int count;
        while ((count = reader.read(buffer)) != -1) {
            text.append(buffer, 0, count);
        }
        return text.toString();
    }
}
WebAppraise.java
- packageorg.loon.test;
- importjava.io.IOException;
- /**
- *Copyright2008
- *
- *LicensedundertheApacheLicense,Version2.0(the"License");youmaynot
- *usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof
- *theLicenseat
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- *Unlessrequiredbyapplicablelaworagreedtoinwriting,software
- *distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT
- *WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe
- *Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder
- *theLicense.
- *
- *@projectloonframework
- *@authorchenpeng
- *@email:ceponline@yahoo.com.cn
- *@version0.1
- */
- publicclassWebAppraise{
- privateStringgoogleSum;
- privateStringbaiduSum;
- privateStringmsnSum;
- privateStringaltaVistaSum;
- privateStringallTheWebSum;
- privateStringyahooSum;
- privateStringtestURL;
- publicWebAppraise(finalStringurl){
- if(url!=null&&!"".equals(url)){
- this.testURL=url.trim();
- if(this.testURL.startsWith("http://")){
- this.testURL=this.testURL.substring(7);
- }
- if(this.testURL.startsWith("https://")){
- this.testURL=this.testURL.substring(8);
- }
- }else{
- thrownewRuntimeException("urlisNULL!");
- }
- }
- /**
- *分析指定链接结果,并返回整型数值
- *
- *@paramsearchURL
- *@paramanchor
- *@paramtrail
- *@return
- */
- privatestaticintgetLinks(finalStringsearchURL,finalStringanchor,
- finalStringtrail){
- intcount=0;
- StringserverResponse;
- try{
- //我国特色……
- if(searchURL.startsWith("http://www.baidu.com")){
- //永不离休的gb2312同志(-_-||)
- serverResponse=SimpleWebClient.getRequestHttp(searchURL,
- "gb2312");
- }else{
- serverResponse=SimpleWebClient.getRequestHttp(searchURL);
- }
- }catch(IOExceptione){
- serverResponse=e.getMessage();
- }
- intpos=serverResponse.indexOf(anchor);
- if(pos>1){
- serverResponse=serverResponse.substring(pos+anchor.length());
- pos=serverResponse.indexOf(trail);
- Stringvalue=serverResponse.substring(0,pos).trim();
- value=value.replace(",","");
- value=value.replace(".","");
- count=Integer.parseInt(value);
- }
- returncount;
- }
- publicStringgetAllTheWebSite(){
- returngetAllTheWebSite(false);
- }
- publicStringgetAllTheWebSite(booleanisDomain){
- try{
- StringallTheWeb;
- if(isDomain){
- allTheWeb="http://www.alltheweb.com/search?cat=web&cs=utf8&rys=0&itag=crv&_sb_lang=any&q=linkdomain%3A"
- +this.testURL;
- }else{
- allTheWeb="http://www.alltheweb.com/search?cat=web&cs=utf-8&q=link%3Ahttp%3A%2F%2F"
- +this.testURL+"&_sb_lang=any";
- }
- allTheWebSum=""
- +getLinks(allTheWeb,"<spanclass=\"ofSoMany\">",
- "</span>");
- }catch(Exceptionex){
- allTheWebSum=ex.getMessage();
- }
- returnallTheWebSum;
- }
- publicStringgetAltaVistaSite(){
- returngetAltaVistaSite(false);
- }
- publicStringgetAltaVistaSite(booleanisDomain){
- try{
- StringaltaVista;
- if(isDomain){
- altaVista="http://www.altavista.com/web/results?itag=ody&q=link%3A"
- +this.testURL+"&kgs=0&kls=0";
- }else{
- altaVista="http://www.altavista.com/web/results?itag=ody&kgs=0&kls=0&q=site%3A"
- +this.testURL;
- }
- altaVistaSum=""+getLinks(altaVista,"AltaVistafound","");
- }catch(Exceptionex){
- altaVistaSum=ex.getMessage();
- }
- returnaltaVistaSum;
- }
- publicStringgetGooglePR(){
- returnGooglePageRank.GooglePR(this.testURL);
- }
- publicStringgetGoogleSite(){
- returngetGoogleSite(false);
- }
- publicStringgetGoogleSite(finalbooleanisDomian){
- try{
- Stringgoogle;
- //反向链接
- if(isDomian){
- google="http://www.google.com/search?hl=en&q=link%3A"
- +this.testURL;
- }else{
- google="http://www.google.com/search?hl=en&q=site%3A"
- +this.testURL+"&btnG=Google+Search&aq=f&oq=";
- }
- googleSum=""+getLinks(google,"about<b>","</b>");
- }catch(Exceptionex){
- googleSum=ex.getMessage();
- }
- returngoogleSum;
- }
- publicStringgetBaiduSite(){
- returngetBaiduSite(false);
- }
- publicStringgetBaiduSite(finalbooleanisDomian){
- try{
- Stringbaidu;
- if(isDomian){
- baidu="http://www.baidu.com/s?wd=domain%3A"+this.testURL
- +"&cl=3";
- }else{
- baidu="http://www.baidu.com/s?wd=site%3A"+this.testURL;
- }
- baiduSum=""+getLinks(baidu,"找到相关网页","篇");
- }catch(Exceptionex){
- Stringbaidu;
- if(isDomian){
- baidu="http://www.baidu.com/s?wd=domain%3A"+this.testURL
- +"&cl=3";
- }else{
- baidu="http://www.baidu.com/s?wd=site%3A"+this.testURL;
- }
- baiduSum=""+getLinks(baidu,"找到相关网页约","篇");
- }
- returnbaiduSum;
- }
- publicStringgetYahooSite(){
- returngetYahooSite(false);
- }
- publicStringgetYahooSite(finalbooleanisDomian){
- try{
- Stringyahoo;
- if(isDomian){
- yahoo="http://sitemap.cn.yahoo.com/search?p="+this.testURL
- +"&bwm=i";
- yahooSum=""+getLinks(yahoo,"<strong>","</strong>");
- }else{
- yahoo="http://www.yahoo.cn/s?p=site%3A"+this.testURL
- +"&pid=hp&v=web";
- yahooSum=""+getLinks(yahoo,"找到相关网页约","条");
- }
- }catch(Exceptionex){
- yahooSum=ex.getMessage();
- }
- returnyahooSum;
- }
- publicStringgetMsnSite(){
- returngetMsnSite(false);
- }
- publicStringgetMsnSite(booleanisDomain){
- try{
- Stringmsn;
- if(isDomain){
- msn="http://cnweb.search.live.com/results.aspx?q=link%3A"
- +this.testURL+"&mkt=zh-cn&scope=&FORM=LIVSO";
- }else{
- msn="http://cnweb.search.live.com/results.aspx?q=site%3A"
- +this.testURL+"&go=&form=QBRE";
- }
- msnSum=""+getLinks(msn,"共","条搜索结果");
- }catch(Exceptionex){
- msnSum=ex.getMessage();
- }
- returnmsnSum;
- }
- publicStringgetTestURL(){
- returntestURL;
- }
- }
Test.java
- packageorg.loon.test;
- /**
- *Copyright2008
- *
- *LicensedundertheApacheLicense,Version2.0(the"License");youmaynot
- *usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof
- *theLicenseat
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- *Unlessrequiredbyapplicablelaworagreedtoinwriting,software
- *distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT
- *WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe
- *Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder
- *theLicense.
- *
- *@projectloonframework
- *@authorchenpeng
- *@email:ceponline@yahoo.com.cn
- *@version0.1
- */
- publicclassTest{
- publicstaticvoidmain(String[]args){
- WebAppraiseappraise=newWebAppraise("http://blog.csdn.net/cping1982");
- System.out.println("GooglePagerRank值:"+appraise.getGooglePR());
- System.out.println("google收录:"+appraise.getGoogleSite());
- System.out.println("google反向收录:"+appraise.getGoogleSite(true));
- System.out.println("yahoo收录:"+appraise.getYahooSite());
- System.out.println("yahoo反向收录:"+appraise.getYahooSite(true));
- System.out.printl