java爬企查查

尝试了很多终于爬到了,代码没有整理,很乱,无用代码也比较多。
请手下留情,不要对代码评价,只是提供参考,后续优化后再整理一次上传到资源中。
看不懂的可以留言。

需求:
1. 根据已有企业名称,查询企业电话、邮箱等信息。
使用技术:Jsoup。
IP代理地址:豌豆IP(需要去注册,自行百度)

实现步骤:1. 调用豌豆接口获取代理IP;2. 读取excel中提前准备好的企业名单;3. 设置代理IP抓取企业信息;4. 将抓取结果写入TXT。

/**
 * Entry point: looks up every company listed in the Excel sheet through a randomly
 * chosen proxy and appends each result line to a text file.
 *
 * Flow: fetch the proxy list, read the Excel rows, then for each row pick a proxy,
 * call {@code HttpLogin.qichacha}, append the returned line to the output file and
 * pause for three seconds.
 *
 * @param args unused
 * @throws Exception if no usable proxy can be obtained, the workbook cannot be read,
 *                   the search key cannot be URL-encoded, or the pause is interrupted
 */
public static void main(String[] args) throws Exception {
		// Placeholders: replace with your own proxy-service endpoint/key and account.
		final String ipApiUrl = "http://api.wandoudl.com/api/ip";
		final String appKey = "豌豆APP-key";
		final String proxyUser = "豌豆账号";
		final String proxyPwd = "豌豆密码";

		List<WanDouIP> listIp = new ArrayList<WanDouIP>();
		JSONObject res = getIplist(ipApiUrl, appKey);
		System.out.println(res.get("code"));
		jiexiIp(res, listIp);

		// The credentials never change, so register the authenticator once instead of per row.
		Authenticator.setDefault(new ProxyAuthenticator(proxyUser, proxyPwd));

		ReadExcelUtils.ReadExcel("D:\\51java81.xls");
		List<String[]> list = ReadExcelUtils.readExcelContent();
		File out = new File("E:\\dd82.txt");

		for (int i = 0; i < list.size(); i++) {
			String[] row = list.get(i);
			// Column 0 is the position, column 3 the company name; skip rows without a company.
			if (row == null || row.length < 4 || row[3] == null || row[3].trim().length() == 0) {
				continue;
			}
			String gangwei = row[0] == null ? "" : row[0].trim();
			String company = row[3].trim();

			WanDouIP wd = getIp(listIp);
			if (wd == null || wd.getIp() == null || wd.getPort() == null) {
				// Pool exhausted or entry incomplete: ask the service for fresh proxies once.
				res = getIplist(ipApiUrl, appKey);
				jiexiIp(res, listIp);
				wd = getIp(listIp);
			}
			if (wd == null || wd.getIp() == null || wd.getPort() == null) {
				// Fail with a clear message instead of a NullPointerException further down.
				throw new IllegalStateException("No usable proxy IP available from " + ipApiUrl);
			}

			InetSocketAddress addr = new InetSocketAddress(wd.getIp(), Integer.parseInt(wd.getPort()));
			Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);

			// The charset name must be exactly "UTF-8"; any stray whitespace inside it makes
			// URLEncoder.encode throw UnsupportedEncodingException.
			String url = "https://www.qichacha.com/search_index?key="
					+ java.net.URLEncoder.encode(company, "UTF-8")
					+ "&ajaxflag=1&p=1&ajaxflag=1";
			String str = HttpLogin.qichacha(url, proxy, gangwei, company);

			if (str != null) {
				// Append the scraped line; the writer is closed even when writing fails.
				try (PrintWriter pw = new PrintWriter(new FileWriter(out, true))) {
					pw.println(str);
					if (pw.checkError()) {
						System.err.println("Failed to write result to " + out);
					}
				} catch (IOException e) {
					e.printStackTrace();
				}
			}

			// Throttle after every request (also when nothing was found) to avoid being blocked.
			Thread.sleep(3000);
		}
	}
	/**
	 * Converts the proxy entries of an API response into {@link WanDouIP} objects and
	 * appends them to {@code listIp}.
	 *
	 * The entries are read from the {@code "data"} member of {@code res}; each entry must
	 * provide {@code "ip"}, {@code "port"} and {@code "expire_time"}. A missing response,
	 * a response without {@code "data"}, and entries lacking one of the three fields are
	 * ignored, so the list is simply left unchanged in those cases.
	 *
	 * @param res    parsed API response; may be {@code null} or a JSON null object
	 * @param listIp list that receives the parsed proxies
	 */
	private static void jiexiIp(JSONObject res,List<WanDouIP> listIp) {
		if (res == null || res.isNullObject()) {
			return;
		}
		Object data = res.get("data");
		if (data == null) {
			return;
		}
		JSONArray jsonArray = JSONArray.fromObject(data);
		for (int i = 0; i < jsonArray.size(); i++) {
			JSONObject entry = jsonArray.getJSONObject(i);
			Object ip = entry.get("ip");
			Object port = entry.get("port");
			Object expireTime = entry.get("expire_time");
			if (ip == null || port == null || expireTime == null) {
				continue; // incomplete entry: cannot be used as a proxy
			}
			WanDouIP wd = new WanDouIP();
			wd.setIp(ip.toString());
			wd.setPort(port.toString());
			wd.setExpire_time(expireTime.toString());
			listIp.add(wd);
		}
	}
	/**
	 * {@link Authenticator} that always hands back the same user name / password pair
	 * it was constructed with.
	 */
	static class ProxyAuthenticator extends Authenticator {
		private final String authUser;
		private final String authPwd;

		/**
		 * @param authUser user name to report
		 * @param authPwd  password to report
		 */
		public ProxyAuthenticator(String authUser, String authPwd) {
			this.authPwd = authPwd;
			this.authUser = authUser;
		}

		/** @return a fresh credentials object built from the stored user name and password */
		@Override
		public PasswordAuthentication getPasswordAuthentication() {
			char[] secret = authPwd.toCharArray();
			return new PasswordAuthentication(authUser, secret);
		}
	}
	/**
	 * POSTs {@code post} as a form-encoded body to {@code path} and parses the response
	 * body as JSON.
	 *
	 * The response body is parsed whatever the HTTP status is; a non-200 status is only
	 * reported on stderr. If the request fails, the stack trace is printed and
	 * {@code JSONObject.fromObject(null)} is returned.
	 * NOTE(review): presumably that is a JSON "null object" on which {@code get} throws —
	 * callers should check before reading fields.
	 *
	 * @param path URL of the API endpoint
	 * @param post request body to send
	 * @return the parsed response, or the result of parsing {@code null} on failure
	 */
	private static JSONObject  getIplist(String path,String post) {
		String result = null;
		// try-with-resources releases the client's connections even when the call fails
		try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
			HttpPost httppost = new HttpPost(path);
			httppost.addHeader("Content-Type",
					"application/x-www-form-urlencoded; charset=utf-8");
			httppost.setEntity(new StringEntity(post, "utf-8"));

			org.apache.http.HttpResponse response = httpclient.execute(httppost);
			int status = response.getStatusLine().getStatusCode();
			if (status != HttpStatus.SC_OK) {
				System.err.println("Proxy API " + path + " answered HTTP " + status);
			}
			result = EntityUtils.toString(response.getEntity(), "utf-8");
		} catch (Exception e) {
			// Best effort: report the failure instead of hiding it, then fall through.
			e.printStackTrace();
		}
		return JSONObject.fromObject(result);
	}
	
	static WanDouIP getIp(List<WanDouIP> listIp){
		try {
			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");
			Random random = new Random();
			int n = random.nextInt(listIp.size());
			if (listIp.size() == 0) {
			 res = getIplist("http://api.wandoudl.com/api/ip","豌豆APP-key");
			WanDouIP wd = listIp.get(n);
			Date dt1 = sdf.parse(wd.getExpire_time());
			Date dt2 = new Date();
			if (dt1.getTime() < dt2.getTime()) {
				listIp.remove(n);
				getIp(listIp);
			} 
			return wd;
		} catch (Exception e) {
			// TODO: handle exception
		}
		return null;
	}

IP池

/**
 * Plain value holder for one proxy entry: host, port and expiry timestamp, each kept
 * as the raw string it was set with.
 */
public class WanDouIP {
	private String ip;
	private String port;
	private String expireTime;

	/** @return the proxy host, or {@code null} if it was never set */
	public String getIp() {
		return this.ip;
	}

	/** @return the proxy port as text, or {@code null} if it was never set */
	public String getPort() {
		return this.port;
	}

	/** @return the expiry timestamp as text, or {@code null} if it was never set */
	public String getExpire_time() {
		return this.expireTime;
	}

	/** @param value proxy host to store */
	public void setIp(String value) {
		this.ip = value;
	}

	/** @param value proxy port (as text) to store */
	public void setPort(String value) {
		this.port = value;
	}

	/** @param value expiry timestamp (as text) to store */
	public void setExpire_time(String value) {
		this.expireTime = value;
	}

}

爬取qichacha

/**
 * Downloads a search-result page and extracts the details of the first result row whose
 * company name contains {@code companynameold}.
 *
 * The matching row is printed tab-separated to stdout and returned as one line whose
 * fields are joined with {@code "---"}: position, searched name, found name, the name
 * parsed from the capital/date line, registered capital, founding date, e-mail, phone
 * and address. Fields that cannot be located stay empty.
 *
 * NOTE(review): if the proxy requires Basic authentication and the target is HTTPS,
 * recent JDKs disable that scheme for tunnelling by default (system property
 * {@code jdk.http.auth.tunneling.disabledSchemes}) — confirm the JVM settings.
 *
 * @param key            full URL of the search page to request
 * @param proxy          proxy to route the request through; {@code null} means a direct connection
 * @param gangwei        position label, copied unchanged into the output
 * @param companynameold company name that was searched for
 * @return the result line; an empty string when no row matches or the download fails;
 *         {@code null} when the page contains no result table or the table has no rows
 */
public static String qichacha(String key,Proxy proxy,String gangwei,String companynameold) {
		String temp = "";
		String html;
		try {
			html = fetchPage(key, proxy);
		} catch (IOException e) {
			// Network problem: report it and return the empty result.
			e.printStackTrace();
			return temp;
		}

		Document doc = Jsoup.parseBodyFragment(html);
		Element itemElement = doc.body().getElementsByTag("tbody").first();
		if (itemElement == null) {
			// No result table in the page.
			System.out.println("赶紧访问企查查去点击一下认证下你不是机器人");
			return null;
		}
		Elements list = itemElement.getElementsByTag("tr");
		if (list.size() == 0) {
			return null;
		}

		for (Element element : list) {
			Element firstLink = element.getElementsByTag("a").first();
			if (firstLink == null) {
				continue; // a row without a link carries no company name
			}
			String companyName = firstLink.text();
			// Only the row that matches the searched company is of interest.
			if (companynameold == null || !companyName.contains(companynameold)) {
				continue;
			}

			String name = "";
			String price = "";
			String time = "";
			String email = "";
			String call = "";
			String address = "";
			for (Element element2 : element.getElementsByClass("m-t-xs")) {
				// The page may use the full-width colon; normalise it so the ASCII markers
				// below match either form.
				String t = element2.text().replace('\uFF1A', ':');
				if (t.contains("注册资本") || t.contains("成立日期")) {
					Element person = element2.getElementsByTag("a").first();
					if (person != null) {
						name = person.text();
					} else {
						// No link: take the text between the first colon and the capital label.
						int capitalAt = t.indexOf("注册资本:");
						int colonAt = t.indexOf(":");
						if (colonAt + 1 <= capitalAt) {
							name = t.substring(colonAt + 1, capitalAt);
						}
					}
					price = sliceBetween(t, "注册资本:", "成立日期", price);
					time = sliceBetween(t, "成立日期:", null, time);
				}
				if (t.contains("邮箱")) {
					email = sliceBetween(t, "邮箱:", "电话", email);
					call = sliceBetween(t, "电话:", null, call).replace("更多号码", "");
				}
				if (t.contains("地址")) {
					address = sliceBetween(t, "地址:", null, address);
				}
			}

			System.out.println(gangwei+ "\t" +companynameold+ "\t" +companyName + "\t" + name + "\t" + price + "\t" + time + "\t" + email + "\t" + call
					+ "\t" + address + "\t");
			temp = gangwei+ "---" +companynameold+ "---" +companyName + "---" + name + "---" + price + "---" + time + "---" + email + "---" + call
					+ "---" + address;
			break;
		}

		return temp;
	}

	/** Requests {@code address} (through {@code proxy} when given) and returns the body with line breaks removed. */
	private static String fetchPage(String address, Proxy proxy) throws IOException {
		URL realUrl = new URL(address);
		// openConnection(Proxy) rejects null, so fall back to a direct connection.
		URLConnection connection = (proxy != null) ? realUrl.openConnection(proxy) : realUrl.openConnection();
		// Without timeouts a dead proxy would block the crawl forever.
		connection.setConnectTimeout(10000);
		connection.setReadTimeout(10000);
		connection.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
		connection.setRequestProperty("connection", "Keep-Alive");
		connection.setRequestProperty("Cookie", "换你自己的cookie,在浏览器查");
		connection.setRequestProperty("User-Agent","换你自己的,浏览器里查");
		connection.connect();

		StringBuilder page = new StringBuilder();
		try (BufferedReader in = new BufferedReader(new InputStreamReader(
				connection.getInputStream(),"utf-8"))) {
			String line;
			while ((line = in.readLine()) != null) {
				page.append(line);
			}
		}
		return page.toString();
	}

	/**
	 * Returns the text after {@code startMarker} up to the next {@code endMarker} (or to the
	 * end when {@code endMarker} is null or absent); {@code fallback} when {@code startMarker} is absent.
	 */
	private static String sliceBetween(String text, String startMarker, String endMarker, String fallback) {
		int from = text.indexOf(startMarker);
		if (from < 0) {
			return fallback;
		}
		from += startMarker.length();
		int to = (endMarker == null) ? -1 : text.indexOf(endMarker, from);
		return (to < 0) ? text.substring(from) : text.substring(from, to);
	}

poi读excel工具

/**
 * Small Apache POI helper that loads one {@code .xls}/{@code .xlsx} workbook into static
 * state and exposes the first sheet's header row and data rows as strings.
 *
 * The workbook is held in static fields, so only one file is loaded at a time.
 */
public class ReadExcelUtils {

    /** Minimum length of every row array returned by {@link #readExcelContent()}. */
    private static final int MIN_COLUMNS = 4;

    private static Workbook wb;
    private static Sheet sheet;
    private static Row row;

    /**
     * Loads the workbook at {@code filepath}, chosen by file extension (case-insensitive).
     *
     * On an unsupported extension or an I/O error (stack trace printed) no workbook is
     * loaded and the read methods will throw. A {@code null} path leaves the current
     * state untouched.
     *
     * @param filepath path of the Excel file; may be {@code null}
     */
    public static void ReadExcel(String filepath) {
        if(filepath==null){
            return;
        }
        // Drop any previously loaded workbook so a failed load cannot leave stale data behind.
        wb = null;
        int dot = filepath.lastIndexOf('.');
        String ext = dot < 0 ? "" : filepath.substring(dot);
        // Both workbook constructors read the stream completely, so it can be closed right away.
        try (InputStream is = new FileInputStream(filepath)) {
            if (".xls".equalsIgnoreCase(ext)) {
                wb = new HSSFWorkbook(is);
            } else if (".xlsx".equalsIgnoreCase(ext)) {
                wb = new XSSFWorkbook(is);
            }
        } catch (IOException e) {
            // Covers FileNotFoundException as well.
            e.printStackTrace();
        }
    }


    /**
     * Reads the header row (row 0) of the first sheet.
     *
     * @return the header cell texts; an empty array when the sheet has no header row;
     *         missing cells are returned as empty strings
     * @throws Exception if no workbook is loaded
     */
    public String[] readExcelTitle() throws Exception{
        if(wb==null){
            throw new Exception("Workbook对象为空!");
        }
        sheet = wb.getSheetAt(0);
        row = sheet.getRow(0);
        if (row == null) {
            return new String[0];
        }
        // Number of header columns
        int colNum = row.getPhysicalNumberOfCells();

        String[] title = new String[colNum];
        for (int i = 0; i < colNum; i++) {
            Cell cell = row.getCell(i);
            title[i] = cell == null ? "" : cell.getStringCellValue();
        }
        return title;
    }

    /**
     * Reads all data rows (row 1 onwards) of the first sheet.
     *
     * Each returned array has one entry per header column and at least
     * {@value #MIN_COLUMNS} entries; positions beyond the header's column count are
     * {@code null}. Dates are formatted as {@code yyyy-MM-dd}, numbers via
     * {@link String#valueOf(double)}, empty cells as {@code ""}. Rows that do not exist
     * in the sheet are skipped.
     *
     * @return the data rows in sheet order; empty when the sheet has no header row
     * @throws Exception if no workbook is loaded
     */
    public static  List<String[]>  readExcelContent() throws Exception{
        if(wb==null){
            throw new Exception("Workbook对象为空!");
        }
        List<String[]> list = new ArrayList<String[]>();

        sheet = wb.getSheetAt(0);
        // Index of the last row
        int rowNum = sheet.getLastRowNum();
        Row header = sheet.getRow(0);
        if (header != null) {
            int colNum = header.getPhysicalNumberOfCells();
            // Never smaller than MIN_COLUMNS, never too small for the actual column count.
            int width = Math.max(MIN_COLUMNS, colNum);

            // Data starts at the second row; the first row is the header.
            for (int i = 1; i <= rowNum; i++) {
                row = sheet.getRow(i);
                if (row == null) {
                    continue; // gap in the sheet
                }
                String[] str = new String[width];
                for (int j = 0; j < colNum; j++) {
                    str[j] = toText(getCellFormatValue(row.getCell(j)));
                }
                list.add(str);
            }
        }
        System.out.println("excel读取==========="+list.size());
        return list;
    }

    /** Renders a cell value as text: dates as {@code yyyy-MM-dd}, everything else via {@code String.valueOf}. */
    private static String toText(Object value) {
        if (value instanceof Date) {
            return new java.text.SimpleDateFormat("yyyy-MM-dd").format((Date) value);
        }
        return String.valueOf(value);
    }

    /**
     * Extracts a cell's value according to its type.
     *
     * @param cell the cell to read; may be {@code null}
     * @return a {@link Date} for date-formatted numeric cells, otherwise a {@code String}
     *         (empty for {@code null}, blank and unsupported cells)
     */
    private  static Object getCellFormatValue(Cell cell) {
        if (cell == null) {
            return "";
        }
        switch (cell.getCellType()) {
            case Cell.CELL_TYPE_NUMERIC:
                return numericOrDate(cell);
            case Cell.CELL_TYPE_FORMULA:
                try {
                    return numericOrDate(cell);
                } catch (IllegalStateException notNumeric) {
                    // The formula's cached result is not a number; try it as text.
                    try {
                        return cell.getStringCellValue();
                    } catch (IllegalStateException notText) {
                        return "";
                    }
                }
            case Cell.CELL_TYPE_STRING:
                return cell.getRichStringCellValue().getString();
            default:
                return "";
        }
    }

    /** Returns the cell's date when it is date-formatted, otherwise its numeric value as text. */
    private static Object numericOrDate(Cell cell) {
        if (DateUtil.isCellDateFormatted(cell)) {
            return cell.getDateCellValue();
        }
        return String.valueOf(cell.getNumericCellValue());
    }
}

抓取结果
在这里插入图片描述

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值