一个小爬虫和正则表达式的例子,用于获取2015年迅雷校招的笔试名单

笔记:

jsoup.jar包常用于制作网页爬虫,使用时只需要导入jsoup.jar这一个包就行,具体用法参考[点击打开链接]。解析网页时通常也伴随着正则表达式的使用,正则表达式中group的概念参考[点击打开链接]。

import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class JsoupTest {
  
	int count;
	
	int OPM;
	
	int Cplusplus;
	
	int SystemOperationEngineer;
	
	int  DataDevelopmentEngineer;
	
	int PM;
	
	int VisualDesigner;
	
	int  WebReconstructionEngineer;
	
	int InteractionDesigner;
	
	int BusinessSpecialist;
	
	int city;
	
	ArrayList <WHUT> WHUTStudent;
   
      	public  void get(int city) throws IOException  {		
      		
      		count=1;
      		
      		OPM=0;
      		
      		Cplusplus=0;
      		
      		SystemOperationEngineer=0;
      		
      		DataDevelopmentEngineer=0;
      		
      		PM=0;
      		
      		VisualDesigner=0;
      		
      		WebReconstructionEngineer=0;
      		
      		InteractionDesigner=0;
      		
      		BusinessSpecialist=0;
      		
      		this.city=city;
      		
      		 WHUTStudent=new ArrayList <WHUT>();
    	  
        		Document doc = Jsoup.connect("http://svr.campus.xunlei.com/viewlist?callback=jQuery110209096807448659092_1444288837792&city="+city+"&from=0&to=100000&name=-1&_=0").get();
        		
        		String value=doc.toString();
        		 
        		Pattern pattern_name = Pattern.compile(""name":"(.+?)",");
        		 
        		Matcher macher_name = pattern_name.matcher(value);
        		
        		Pattern pattern_position = Pattern.compile("position":"(.+?)",");
       		 
        		Matcher macher_position = pattern_position.matcher(value);
        		
        		Pattern pattern_school = Pattern.compile("school":"(.+?)",");
       		 
        		Matcher macher_school = pattern_school.matcher(value);
        		
        		Pattern pattern_major = Pattern.compile("major":"(.+?)",");
       		 
        		Matcher macher_major = pattern_major.matcher(value);
        		
        		Pattern pattern_time = Pattern.compile("time":"(.+?)",");
       		 
        		Matcher macher_time = pattern_time.matcher(value);
        		 
        		while(macher_name.find()&&macher_position.find()&&macher_school.find()&&macher_major.find()&&macher_time.find())
        		{	
        			
        			if(city==-1){
        				System.out.println("编号:"+count);	
        				System.out.println(macher_name.group(1));
        				System.out.println(macher_position.group(1));
        				System.out.println(macher_school.group(1));
        				System.out.println(macher_major.group(1));
        				System.out.println(macher_time.group(1));
        				System.out.println("************************************");
        			}
        			
        		count++;
        		
        		if(macher_position.group(1).equals("运营产品经理"))	OPM++;

        		if(macher_position.group(1).equals("C++开发工程师"))	Cplusplus++;
        		
        		if(macher_position.group(1).equals("系统运维工程师"))	SystemOperationEngineer++;

        		if(macher_position.group(1).equals("数据开发工程师"))	DataDevelopmentEngineer++;
        		
        		if(macher_position.group(1).equals("产品经理"))	PM++;

        		if(macher_position.group(1).equals("视觉设计师"))	VisualDesigner++;

        		if(macher_position.group(1).equals("网页重构工程师"))	WebReconstructionEngineer++;

        		if(macher_position.group(1).equals("交互设计师")) InteractionDesigner++;

        		if(macher_position.group(1).equals("商务专员")) 	BusinessSpecialist++;
        		
        		if(macher_school.group(1).equals("武汉理工大学")) {
        			
        			WHUT tmp=new WHUT();
        			
        			tmp.name=macher_name.group(1);
        			
        			tmp.position=macher_position.group(1);
        			
        			tmp.school=macher_school.group(1);
        			
        			tmp.major=macher_major.group(1);
        			
        			tmp.time=macher_time.group(1);
        			
        			WHUTStudent.add(tmp);
        			
        		}

        	}
        		
        		  		
      	}
      	
    	public void print(){
    		switch(city){
    		
    		case -1:System.out.println("全国:");break;
    		
    		case 12:System.out.println("西安:");break;
    		
    		case 11:System.out.println("成都:");break;
    		
    		case 8:System.out.println("武汉:");break;
    		
    		case 4: System.out.println("广州:");break;
    		}
    		
    		
    		
    		System.out.println("商务专员:"+BusinessSpecialist+"人");
    		System.out.println("产品经理:"+PM+"人");
    		System.out.println("视觉设计师:"+VisualDesigner+"人");
    		System.out.println("交互设计师:"+InteractionDesigner+"人");
    		System.out.println("运营产品经理:"+OPM+"人");
    		System.out.println("C++开发工程师:"+Cplusplus+"人");
    		System.out.println("网页重构工程师:"+WebReconstructionEngineer+"人");
    		System.out.println("系统运维工程师:"+SystemOperationEngineer+"人");
    		System.out.println("数据开发工程师:"+DataDevelopmentEngineer+"人");
    		System.out.println("************************************");
    		
    		if(city==8){
    			
    			System.out.println("其中武汉理工大学的学生有:"+WHUTStudent.size()+"人");
    			
    			for(int i=0;i<WHUTStudent.size();i++)
    				
    				{System.out.println(WHUTStudent.get(i).name);
    				System.out.println(WHUTStudent.get(i).position);
    				System.out.println(WHUTStudent.get(i).major);
    				System.out.println(WHUTStudent.get(i).school);
    				
    				System.out.println("#########################");
    				}
    			
    			System.out.println("************************************");
    			
    		}
    }
				public static void main(String[] args) throws Exception { 
        			 
					JsoupTest  wholeCountry=new  JsoupTest ();
					
					 wholeCountry.get(-1);//全国
					
					 wholeCountry.print();
					 wholeCountry.get(12);//西安
						
					 wholeCountry.print();
					 wholeCountry.get(11);//成都
						
					 wholeCountry.print();
					 wholeCountry.get(8);//武汉
						
					 wholeCountry.print();
					 wholeCountry.get(4);//广州
						
					 wholeCountry.print();
					 
        		 }



}

/**
 * Plain data holder for one candidate record; all fields are package-visible
 * and start out as {@code null}.
 */
public class WHUT {

    /** Candidate name. */
    String name;

    /** Position the candidate applied for. */
    String position;

    /** Candidate's school. */
    String school;

    /** Candidate's major. */
    String major;

    /** Value of the record's "time" field, kept as raw text. */
    String time;

}



                
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值