尝试了很多终于爬到了,代码没有整理,很乱,无用代码也比较多。
请手下留情,不要对代码评价,这里只是提供参考,后续优化后再整理一次上传到资源中。
看不懂的可以留言。
需求:
1. 根据已有企业名称,查询企业电话、邮箱等信息。
使用技术:Jsoup。
IP代理地址:豌豆IP(需要去注册,自行百度)
实现步骤:1. 调用豌豆接口获取代理IP;2. 读取excel中提前准备好的企业名单;3. 设置代理IP抓取企业信息;4. 将抓取结果写入TXT。
/**
 * Entry point: fetches a pool of proxies, reads the company list from an Excel
 * sheet, queries qichacha for every row through a proxy and appends each result
 * line to a text file.
 *
 * @param args unused
 * @throws Exception if the workbook cannot be read or the thread is interrupted
 */
public static void main(String[] args) throws Exception {
    List<WanDouIP> listIp = new ArrayList<WanDouIP>();
    JSONObject res = getIplist("http://api.wandoudl.com/api/ip", "豌豆APP-key");
    System.out.println(res.get("code"));
    String proxyUser = "豌豆账号";
    String proxyPwd = "豌豆密码";
    jiexiIp(res, listIp); // fill the pool with the proxies returned by the API

    // The credentials never change, so the authenticator is installed once
    // instead of on every loop iteration.
    Authenticator.setDefault(new ProxyAuthenticator(proxyUser, proxyPwd));

    ReadExcelUtils.ReadExcel("D:\\51java81.xls");
    List<String[]> list = ReadExcelUtils.readExcelContent(); // rows of the sheet
    File f = new File("E:\\dd82.txt"); // scraped lines are appended here

    for (int i = 0; i < list.size(); i++) {
        String[] cells = list.get(i);
        // Column 0 is passed as the job title and column 3 as the company name
        // (layout of the author's own spreadsheet; adapt as needed).
        if (cells == null || cells.length < 4 || cells[0] == null || cells[3] == null) {
            continue;
        }
        String gangwei = cells[0].trim();
        String company = cells[3].trim();

        WanDouIP wd = getIp(listIp);
        if (wd == null || wd.getIp() == null || wd.getPort() == null) {
            // Pool exhausted: ask the API for a fresh batch and try once more.
            res = getIplist("http://api.wandoudl.com/api/ip", "豌豆APP-key");
            jiexiIp(res, listIp);
            wd = getIp(listIp);
        }
        if (wd == null || wd.getIp() == null || wd.getPort() == null) {
            // Still nothing usable; skip this row instead of failing with a NullPointerException.
            System.out.println("No usable proxy available, skipping: " + company);
            Thread.sleep(3000);
            continue;
        }

        InetSocketAddress addr = new InetSocketAddress(wd.getIp(), Integer.parseInt(wd.getPort()));
        Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);

        // The charset name must be exactly "UTF-8"; a malformed name makes
        // URLEncoder.encode throw UnsupportedEncodingException.
        String url = "https://www.qichacha.com/search_index?key="
                + java.net.URLEncoder.encode(company, "UTF-8")
                + "&ajaxflag=1&p=1&ajaxflag=1";
        String str = HttpLogin.qichacha(url, proxy, gangwei, company);
        if (str == null) {
            continue;
        }

        // Append the result; the writer is closed even when writing fails.
        try (PrintWriter pw = new PrintWriter(new FileWriter(f, true))) {
            pw.println(str);
        } catch (IOException e) {
            e.printStackTrace();
        }

        Thread.sleep(3000); // pause between requests to avoid getting blocked
    }
}
/**
 * Converts every element of the "data" array in {@code res} into a
 * {@link WanDouIP} and appends it to {@code listIp}.
 *
 * @param res    response object whose "data" entry holds the proxy records
 * @param listIp pool that receives the parsed proxies
 */
private static void jiexiIp(JSONObject res,List<WanDouIP> listIp) {
    JSONArray entries = JSONArray.fromObject(res.get("data"));
    int idx = 0;
    while (idx < entries.size()) {
        JSONObject entry = entries.getJSONObject(idx);
        String ip = entry.get("ip").toString();
        WanDouIP proxyInfo = new WanDouIP();
        proxyInfo.setIp(ip);
        String port = entry.get("port").toString();
        proxyInfo.setPort(port);
        String expiry = entry.get("expire_time").toString();
        proxyInfo.setExpire_time(expiry);
        listIp.add(proxyInfo);
        idx++;
    }
}
/**
 * Authenticator that always answers with the single user name / password pair
 * it was constructed with.
 */
static class ProxyAuthenticator extends Authenticator {
    private final String authUser;
    private final String authPwd;

    /**
     * @param authUser user name to present
     * @param authPwd  password to present
     */
    public ProxyAuthenticator(String authUser, String authPwd) {
        this.authPwd = authPwd;
        this.authUser = authUser;
    }

    /** Builds the credentials object from the stored user name and password. */
    @Override
    public PasswordAuthentication getPasswordAuthentication() {
        char[] secret = authPwd.toCharArray();
        PasswordAuthentication credentials = new PasswordAuthentication(authUser, secret);
        return credentials;
    }
}
/**
 * POSTs {@code post} as a form-urlencoded body to {@code path} and parses the
 * response text as JSON.
 *
 * <p>The response body is read regardless of the HTTP status. Failures are
 * printed rather than silently dropped; in that case the result is whatever
 * {@code JSONObject.fromObject(null)} yields (NOTE(review): presumably a
 * "null" JSON object - confirm against the JSON library in use).
 *
 * @param path URL of the proxy API
 * @param post request body to send
 * @return the parsed response
 */
private static JSONObject getIplist(String path,String post) {
    String result = null;
    // try-with-resources closes the client (and its connections) on every path.
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        HttpPost httppost = new HttpPost(path);
        httppost.addHeader("Content-Type",
                "application/x-www-form-urlencoded; charset=utf-8");
        httppost.setEntity(new StringEntity(post, "utf-8"));
        org.apache.http.HttpResponse response = httpclient.execute(httppost);
        int status = response.getStatusLine().getStatusCode();
        if (status != HttpStatus.SC_OK) {
            System.err.println("getIplist: unexpected HTTP status " + status + " from " + path);
        }
        result = EntityUtils.toString(response.getEntity(), "utf-8");
    } catch (Exception e) {
        // Best effort: report the failure and fall through with a null result.
        e.printStackTrace();
    }
    return JSONObject.fromObject(result);
}
static WanDouIP getIp(List<WanDouIP> listIp){
try {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");
Random random = new Random();
int n = random.nextInt(listIp.size());
if (listIp.size() == 0) {
res = getIplist("http://api.wandoudl.com/api/ip","豌豆APP-key");
WanDouIP wd = listIp.get(n);
Date dt1 = sdf.parse(wd.getExpire_time());
Date dt2 = new Date();
if (dt1.getTime() < dt2.getTime()) {
listIp.remove(n);
getIp(listIp);
}
return wd;
} catch (Exception e) {
// TODO: handle exception
}
return null;
}
IP池
/**
 * One proxy record: host, port and expiry time, all kept as plain strings.
 */
public class WanDouIP {
    /** Proxy host. */
    private String ip;
    /** Proxy port, stored as text. */
    private String port;
    /** Expiry timestamp, stored as text. */
    private String expire_time;

    /** @return the stored host */
    public String getIp() {
        return this.ip;
    }

    /** @return the stored port */
    public String getPort() {
        return this.port;
    }

    /** @return the stored expiry time */
    public String getExpire_time() {
        return this.expire_time;
    }

    /** @param ip host to store */
    public void setIp(String ip) {
        this.ip = ip;
    }

    /** @param port port to store */
    public void setPort(String port) {
        this.port = port;
    }

    /** @param expire_time expiry time to store */
    public void setExpire_time(String expire_time) {
        this.expire_time = expire_time;
    }
}
爬取qichacha
/**
 * Downloads the qichacha search page at {@code key}, looks for the first result
 * row whose company name contains {@code companynameold} and returns its fields
 * joined with "---".
 *
 * @param key            full search URL to request
 * @param proxy          proxy to route the request through; a direct
 *                       connection is used when {@code null}
 * @param gangwei        job title, copied unchanged into the output
 * @param companynameold company name that was searched for
 * @return the joined result line; an empty string when no row matches or an
 *         I/O error occurs; {@code null} when the page has no result table
 */
public static String qichacha(String key,Proxy proxy,String gangwei,String companynameold) {
    String temp = "";
    try {
        URL realUrl = new URL(key);
        // Actually use the supplied proxy; a plain openConnection() ignores it.
        URLConnection connection = (proxy != null)
                ? realUrl.openConnection(proxy)
                : realUrl.openConnection();
        // Common request headers.
        connection.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        connection.setRequestProperty("connection", "Keep-Alive");
        connection.setRequestProperty("Cookie", "换你自己的cookie,在浏览器查");
        connection.setRequestProperty("User-Agent","换你自己的,浏览器里查");
        connection.connect();

        // Read the whole response; the reader is closed even if reading fails.
        StringBuilder page = new StringBuilder();
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                connection.getInputStream(),"utf-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
        }

        Document doc = Jsoup.parseBodyFragment(page.toString());
        Element itemElement = doc.body().getElementsByTag("tbody").first();
        if (itemElement == null) {
            // No result table in the page; the original message asks the user to
            // pass the site's human verification in a browser.
            System.out.println("赶紧访问企查查去点击一下认证下你不是机器人");
            return null;
        }
        Elements list = itemElement.getElementsByTag("tr");
        if (list == null || list.size() == 0) {
            return null;
        }

        for (Element element : list) {
            Element firstLink = element.getElementsByTag("a").first();
            if (firstLink == null) {
                continue; // row without a company link
            }
            String companyName = firstLink.text();
            String name = "";
            String oldname = companynameold;
            String gw = gangwei;
            String price = "";
            String time = "";
            String email = "";
            String call = "";
            String address = "";

            for (Element element2 : element.getElementsByClass("m-t-xs")) {
                String t = element2.text();
                if (t.contains("注册资本") || t.contains("成立日期")) {
                    Element nameLink = element2.getElementsByTag("a").first();
                    if (nameLink != null) {
                        name = nameLink.text();
                    } else {
                        // Fallback: text between the first colon and the capital label.
                        int from = t.indexOf(":") + 1;
                        int capitalAt = t.indexOf("注册资本:");
                        if (capitalAt >= from) {
                            name = t.substring(from, capitalAt);
                        }
                    }
                    int capitalStart = t.indexOf("注册资本:");
                    if (capitalStart >= 0) {
                        int from = capitalStart + "注册资本:".length();
                        int dateAt = t.indexOf("成立日期");
                        // Take up to the date label, or to the end when it is absent.
                        price = t.substring(from, dateAt >= from ? dateAt : t.length());
                    }
                    int dateStart = t.indexOf("成立日期:");
                    if (dateStart >= 0) {
                        time = t.substring(dateStart + "成立日期:".length());
                    }
                }
                if (t.contains("邮箱")) {
                    // The e-mail label is expected at the start of the text.
                    int emailFrom = "邮箱:".length();
                    int phoneAt = t.indexOf("电话");
                    if (phoneAt >= emailFrom) {
                        email = t.substring(emailFrom, phoneAt);
                    } else if (t.length() >= emailFrom) {
                        email = t.substring(emailFrom);
                    }
                    int phoneStart = t.indexOf("电话:");
                    if (phoneStart >= 0) {
                        call = t.substring(phoneStart + "电话:".length()).replace("更多号码", "");
                    }
                }
                if (t.contains("地址") && t.length() >= "地址:".length()) {
                    address = t.substring("地址:".length());
                }
            }

            // Only accept the row when it is the company that was searched for.
            if (companyName.contains(oldname)) {
                System.out.println(gw+ "\t" +oldname+ "\t" +companyName + "\t" + name + "\t" + price + "\t" + time + "\t" + email + "\t" + call
                        + "\t" + address + "\t");
                temp += gw+ "---" +oldname+ "---" +companyName + "---" + name + "---" + price + "---" + time + "---" + email + "---" + call
                        + "---" + address;
                break;
            }
        }
    } catch (HttpException e) {
        // Fatal protocol-level problem, e.g. a bad address or malformed response.
        System.out.println("Please check your provided http address!");
        e.printStackTrace();
    } catch (IOException e) {
        // Network failure.
        e.printStackTrace();
    }
    return temp;
}
poi读excel工具
/**
 * Small Apache POI helper: {@link #ReadExcel(String)} opens a workbook, after
 * which the first sheet's header and data rows can be read.
 *
 * <p>The workbook is held in a static field, so the class handles one workbook
 * at a time.
 */
public class ReadExcelUtils {
    /** Workbook set by the last {@code ReadExcel} call; {@code null} when none is loaded. */
    private static Workbook wb;

    /**
     * Opens the workbook at {@code filepath}. ".xls" files are read with
     * {@code HSSFWorkbook}, ".xlsx" files with {@code XSSFWorkbook} (extension
     * compared case-insensitively); any other extension clears the workbook.
     * I/O errors are printed and leave the previous workbook untouched.
     *
     * @param filepath path of the Excel file; {@code null} is ignored
     */
    public static void ReadExcel(String filepath) {
        if(filepath==null){
            return;
        }
        int dot = filepath.lastIndexOf(".");
        String ext = dot >= 0 ? filepath.substring(dot) : ""; // no dot: unsupported
        // The stream is only needed while the workbook is constructed, so close it afterwards.
        try (InputStream is = new FileInputStream(filepath)) {
            if(".xls".equalsIgnoreCase(ext)){
                wb = new HSSFWorkbook(is);
            }else if(".xlsx".equalsIgnoreCase(ext)){
                wb = new XSSFWorkbook(is);
            }else{
                wb=null;
            }
        } catch (IOException e) {
            // Covers FileNotFoundException as well.
            e.printStackTrace();
        }
    }

    /**
     * Reads the header (first row) of the first sheet.
     *
     * @return the header cell texts; empty when the sheet has no first row
     * @throws Exception if no workbook has been loaded
     */
    public String[] readExcelTitle() throws Exception{
        if(wb==null){
            throw new Exception("Workbook对象为空!");
        }
        Sheet sheet = wb.getSheetAt(0);
        Row header = sheet.getRow(0);
        if (header == null) {
            return new String[0];
        }
        // Total number of header columns.
        int colNum = header.getPhysicalNumberOfCells();
        String[] title = new String[colNum];
        for (int i = 0; i < colNum; i++) {
            Cell cell = header.getCell(i);
            title[i] = cell == null ? "" : cell.getStringCellValue();
        }
        return title;
    }

    /**
     * Reads the data rows (everything after the header row) of the first sheet.
     *
     * <p>Every returned array has at least four elements and contains no
     * {@code null}: missing cells become empty strings and date cells are
     * converted with {@code String.valueOf}. Rows that do not exist in the
     * sheet are skipped.
     *
     * @return one {@code String[]} per data row, in sheet order
     * @throws Exception if no workbook has been loaded
     */
    public static List<String[]> readExcelContent() throws Exception{
        if(wb==null){
            throw new Exception("Workbook对象为空!");
        }
        List<String[]> list = new ArrayList<String[]>();
        Sheet sheet = wb.getSheetAt(0);
        // Index of the last row.
        int rowNum = sheet.getLastRowNum();
        Row header = sheet.getRow(0);
        int colNum = header == null ? 0 : header.getPhysicalNumberOfCells();
        // Callers index up to column 3, so never hand out a shorter array;
        // wider sheets get an array as wide as the header.
        int width = Math.max(colNum, 4);
        // Data starts at the second row; the first row is the header.
        for (int i = 1; i <= rowNum; i++) {
            Row row = sheet.getRow(i);
            if (row == null) {
                continue; // blank row in the sheet
            }
            String[] str = new String[width];
            for (int j = 0; j < width; j++) {
                str[j] = j < colNum ? String.valueOf(getCellFormatValue(row.getCell(j))) : "";
            }
            list.add(str);
        }
        System.out.println("excel读取==========="+list.size());
        return list;
    }

    /**
     * Converts a cell to a Java value based on its type.
     *
     * @param cell cell to read; may be {@code null}
     * @return a {@code Date} for date-formatted numeric cells, the numeric value
     *         as text for other numeric cells, the text of string cells, and an
     *         empty string for everything else (including {@code null})
     */
    private static Object getCellFormatValue(Cell cell) {
        if (cell == null) {
            return "";
        }
        int type = cell.getCellType();
        if (type == Cell.CELL_TYPE_FORMULA) {
            // Decide by the cached result type; treating every formula as numeric
            // fails for formulas that evaluate to text.
            type = cell.getCachedFormulaResultType();
        }
        switch (type) {
        case Cell.CELL_TYPE_NUMERIC:
            if (DateUtil.isCellDateFormatted(cell)) {
                // Date-formatted cell: hand back the Date itself.
                return cell.getDateCellValue();
            }
            // Plain number.
            return String.valueOf(cell.getNumericCellValue());
        case Cell.CELL_TYPE_STRING:
            return cell.getRichStringCellValue().getString();
        default:
            return "";
        }
    }
}
抓取结果