1.工具备用
package reptile;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
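/**
* JD.com (京东) service addresses: crawls the province / city / district / town
* address hierarchy from JD.com and stores it in a local database.
* @author daiyang
*
*/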
public class Reptile4 {
// Running counter that addTownData prints and increments once per row it inserts.
public static int i = 0;
// Not referenced anywhere in the visible part of this file.
public static int j = 0;
/**
 * Entry point: builds the province and city lists, crawls the districts and towns of
 * every city on a pool of 50 worker threads, then stores the crawled towns in the database.
 *
 * @param args optional; {@code args[0]} overrides the default city source file
 *             ({@code D:/test/city.txt} on the original author's machine)
 * @throws Exception if waiting for the worker pool is interrupted or a setup step fails
 */
public static void main(String[] args) throws Exception {
    // City source file: first command-line argument when given, otherwise the original default.
    String cityFile = (args != null && args.length > 0) ? args[0] : "D:\\test\\city.txt";
    // Decode the province table; each entry is name|provinceId|provinceCode[|type].
    String jdProvince = cover("\u5317\u4eac|1|72|1,\u4e0a\u6d77|2|78|1,\u5929\u6d25|3|51035|1,"
            + "\u91cd\u5e86|4|113|1,\u6cb3\u5317|5|142,\u5c71\u897f|6|303,\u6cb3\u5357|7|412,"
            + "\u8fbd\u5b81|8|560,\u5409\u6797|9|639,\u9ed1\u9f99\u6c5f|10|698,\u5185\u8499\u53e4|11|799,"
            + "\u6c5f\u82cf|12|904,\u5c71\u4e1c|13|1000,\u5b89\u5fbd|14|1116,\u6d59\u6c5f|15|1158,\u798f\u5efa|16|1303,"
            + "\u6e56\u5317|17|1381,\u6e56\u5357|18|1482,\u5e7f\u4e1c|19|1601,\u5e7f\u897f|20|1715,\u6c5f\u897f|21|1827,"
            + "\u56db\u5ddd|22|1930,\u6d77\u5357|23|2121,\u8d35\u5dde|24|2144,\u4e91\u5357|25|2235,\u897f\u85cf|26|2951,"
            + "\u9655\u897f|27|2376,\u7518\u8083|28|2487,\u9752\u6d77|29|2580,\u5b81\u590f|30|2628,\u65b0\u7586|31|2652,"
            + "\u6e2f\u6fb3|52993|52994,\u53f0\u6e7e|32|2768,\u9493\u9c7c\u5c9b|84|84");
    // Read the raw city data and decode it the same way.
    String unicodeCity = readFile(cityFile);
    String jdCity = cover(unicodeCity);
    // Province and city records.
    List<Map<String,Object>> provinceList = provinceDataHandle(jdProvince);
    List<Map<String, Object>> cityList = cityDataHandle(jdCity);
    // Thread-safe collectors filled by the crawler tasks: districts and towns.
    ConcurrentLinkedQueue<Map<String,Object>> districtList = new ConcurrentLinkedQueue<Map<String,Object>>();
    ConcurrentLinkedQueue<Map<String,Object>> courtList = new ConcurrentLinkedQueue<Map<String,Object>>();
    // Move the municipalities from the province level into the city list.
    dataHandle(provinceList, cityList, districtList);
    System.out.println(JSON.toJSON(provinceList));
    System.out.println(JSON.toJSON(cityList));
    System.out.println(JSON.toJSON(districtList));
    // Index of the next city to crawl; every task takes one index.
    // (Original author's note: Chengdu is entry 325 of the list, Mianyang entry 329.)
    AtomicInteger nextCity = new AtomicInteger(0);
    // Fixed pool of 50 worker threads.
    ExecutorService es = Executors.newFixedThreadPool(50);
    System.out.println("===========>>>>>>>>>>>>>>>>>>开始搜索数据");
    // One task per city.
    for (int taskNum = 1; taskNum <= cityList.size(); taskNum++) {
        es.submit(() -> getDistrictInfo(courtList, districtList, cityList, nextCity));
    }
    es.shutdown();
    // Block until every task has finished; returns as soon as the pool terminates
    // instead of polling isTerminated() once per second.
    while (!es.awaitTermination(1, TimeUnit.SECONDS)) {
        // still crawling
    }
    System.out.println("---END---\n");
    System.out.println("所有的子线程都结束了!");
    System.out.println("=================>>>>>>>>>>>>>>>开始存入数据库");
    // The other levels can be stored by enabling the corresponding call:
    //addProvinceData(provinceList);   // provinces
    //addCityData(cityList);           // cities
    //addDistructData(districtList);   // districts (3600+ per the original note)
    addTownData(courtList);            // towns (39836 per the original note)
}
// JDBC handles shared by getConnection() and the add*Data methods, which assign them
// and close them again in their finally blocks.
static Connection conn;
static PreparedStatement ps;
// Only mentioned in commented-out code in the visible part of this file.
static ResultSet rs;
/**
 * Opens a JDBC connection using the hard-coded URL and credentials below and stores it
 * in the shared {@code conn} field. The URL, user name and password are placeholders
 * that must be edited before use.
 *
 * @return the newly opened connection, or {@code null} if it could not be opened
 */
public static Connection getConnection(){
    String url="jdbc:mysql://localhost:port/database";
    String userName="username";
    String password="password";
    // Forget the connection of any earlier call: the add*Data methods close it when they
    // finish, so returning it after a failure here would hand out a dead handle.
    conn = null;
    try {
        Class.forName("com.mysql.jdbc.Driver");
    } catch (ClassNotFoundException e) {
        // Reported but not fatal here; DriverManager below still gets its chance.
        System.out.println("找不到驱动!");
        e.printStackTrace();
    }
    try {
        conn = DriverManager.getConnection(url, userName, password);
        if (conn != null) {
            System.out.println("connection successful");
        }
    } catch (SQLException e) {
        System.out.println( "connection fail");
        e.printStackTrace();
    }
    return conn;
}
/**
 * Inserts every town record into {@code tb_town}, one INSERT per element.
 *
 * @param list town records; each map must hold String values under the keys
 *             "name", "districtId" and "id"
 * @return the update count of the last INSERT that completed, or 0 if none did
 */
public static int addTownData(ConcurrentLinkedQueue<Map<String,Object>> list){
    int row = 0;
    String sql="insert into tb_town(name,districtId,jdTownId) values(?,?,?)";
    // Reset the shared statement so the finally block never touches one left over from an earlier call.
    ps = null;
    conn = getConnection();
    try {
        if (conn == null) {
            // No connection could be opened; there is nothing to insert into.
            return row;
        }
        ps = conn.prepareStatement(sql);
        // One parameter per placeholder in the SQL text.
        for (Map<String,Object> map : list) {
            System.out.println("FBIWARNING i....:"+(i++));
            ps.setString(1, (String) map.get("name"));
            ps.setInt(2, Integer.parseInt((String) map.get("districtId")));
            ps.setInt(3, Integer.parseInt((String) map.get("id")));
            row = ps.executeUpdate();
        }
    } catch (SQLException e) {
        e.printStackTrace();
    } finally {
        // Close each handle on its own: a null handle is skipped and a failing
        // statement close must not prevent the connection from being closed.
        if (ps != null) {
            try {
                ps.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
    return row;
}
/**
 * Inserts every district record into {@code tb_district}, one INSERT per element.
 *
 * @param list district records; each map must hold String values under the keys
 *             "name", "cityId" and "id"
 * @return the update count of the last INSERT that completed, or 0 if none did
 */
public static int addDistructData(ConcurrentLinkedQueue<Map<String,Object>> list){
    int row = 0;
    String sql="insert into tb_district(name,cityId,jdDistrictId) values(?,?,?)";
    // Reset the shared statement so the finally block never touches one left over from an earlier call.
    ps = null;
    conn = getConnection();
    try {
        if (conn == null) {
            // No connection could be opened; there is nothing to insert into.
            return row;
        }
        ps = conn.prepareStatement(sql);
        // One parameter per placeholder in the SQL text.
        for (Map<String,Object> map : list) {
            ps.setString(1, (String) map.get("name"));
            ps.setInt(2, Integer.parseInt((String) map.get("cityId")));
            ps.setInt(3, Integer.parseInt((String) map.get("id")));
            row = ps.executeUpdate();
        }
    } catch (SQLException e) {
        e.printStackTrace();
    } finally {
        // Close each handle on its own: a null handle is skipped and a failing
        // statement close must not prevent the connection from being closed.
        if (ps != null) {
            try {
                ps.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
    return row;
}
/**
 * Inserts every city record into {@code tb_city}, one INSERT per element.
 *
 * @param list city records; each map must hold String values under the keys
 *             "cityName", "provinceId" and "cityId"
 * @return the update count of the last INSERT that completed, or 0 if none did
 */
public static int addCityData(List<Map<String,Object>> list){
    int row = 0;
    String sql="insert into tb_city(name,provinceId,jdCityId) values(?,?,?)";
    // Reset the shared statement so the finally block never touches one left over from an earlier call.
    ps = null;
    conn = getConnection();
    try {
        if (conn == null) {
            // No connection could be opened; there is nothing to insert into.
            return row;
        }
        ps = conn.prepareStatement(sql);
        // One parameter per placeholder in the SQL text.
        for (Map<String,Object> map : list) {
            ps.setString(1, (String) map.get("cityName"));
            ps.setInt(2, Integer.parseInt((String) map.get("provinceId")));
            ps.setInt(3, Integer.parseInt((String) map.get("cityId")));
            row = ps.executeUpdate();
        }
    } catch (SQLException e) {
        e.printStackTrace();
    } finally {
        // Close each handle on its own: a null handle is skipped and a failing
        // statement close must not prevent the connection from being closed.
        if (ps != null) {
            try {
                ps.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
    return row;
}
/**
 * Inserts every province record into {@code tb_province}, one INSERT per element.
 *
 * @param list province records; each map must hold String values under the keys
 *             "provinceName", "provinceCode", "provinceType" and "provinceId"
 * @return the update count of the last INSERT that completed, or 0 if none did
 */
public static int addProvinceData(List<Map<String,Object>> list){
    int row = 0;
    String sql="insert into tb_province(name,provinceCode,provinceType,jdProvinceId) values(?,?,?,?)";
    // Reset the shared statement so the finally block never touches one left over from an earlier call.
    ps = null;
    conn = getConnection();
    try {
        if (conn == null) {
            // No connection could be opened; there is nothing to insert into.
            return row;
        }
        ps = conn.prepareStatement(sql);
        // One parameter per placeholder in the SQL text.
        for (Map<String,Object> map : list) {
            ps.setString(1, (String) map.get("provinceName"));
            ps.setString(2, (String) map.get("provinceCode"));
            ps.setInt(3, Integer.parseInt((String) map.get("provinceType")));
            ps.setInt(4, Integer.parseInt((String) map.get("provinceId")));
            row = ps.executeUpdate();
        }
    } catch (SQLException e) {
        e.printStackTrace();
    } finally {
        // Close each handle on its own: a null handle is skipped and a failing
        // statement close must not prevent the connection from being closed.
        if (ps != null) {
            try {
                ps.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
    return row;
}
/**
 * Fixes up the city list in place for the first four provinces.
 * Every city whose "provinceId" is 4 or less is removed, then the first four entries of
 * {@code provinceList} are appended as cities of themselves.
 *
 * @param provinceList province records; the first four are read
 * @param cityList     city records, modified in place
 * @param districtList not used by this method
 */
private static void dataHandle(List<Map<String, Object>> provinceList,
        List<Map<String, Object>> cityList,
        ConcurrentLinkedQueue<Map<String, Object>> districtList) {
    // 1. The districts of the municipalities were listed at city level: drop them.
    for (Iterator<Map<String, Object>> cursor = cityList.iterator(); cursor.hasNext();) {
        Map<String, Object> city = cursor.next();
        int owningProvince = Integer.parseInt((String) city.get("provinceId"));
        if (owningProvince <= 4) {
            cursor.remove();
        }
    }
    // 2. Append the first four provinces as city entries that point at themselves.
    int index = 0;
    while (index < 4) {
        Map<String, Object> province = provinceList.get(index);
        Object provinceId = province.get("provinceId");
        Map<String, Object> asCity = new HashMap<String, Object>();
        asCity.put("cityName", province.get("provinceName"));
        asCity.put("cityId", provinceId);
        asCity.put("provinceId", provinceId);
        cityList.add(asCity);
        index++;
    }
}
/**
 * Fetches the fourth-level (town) entries of one district and appends them to
 * {@code courtList}. Each appended map holds "name", "id" and the given "districtId".
 * Any failure is reported on the console and ends the processing of this district.
 *
 * @param courtList  collector for town records
 * @param districtId id of the district whose towns are requested
 */
private static void getCourtInfo(
        ConcurrentLinkedQueue<Map<String,Object>> courtList,String districtId) {
    String endpoint = "https://d.jd.com/area/get?fid="+districtId;
    try {
        String body = request(endpoint);
        JSONArray towns = JSONArray.parseArray(body);
        for (Object element : towns) {
            JSONObject town = (JSONObject) element;
            String townName = town.get("name").toString();
            String townId = town.getString("id").toString();
            Map<String,Object> record = new HashMap<>();
            record.put("name", townName);
            record.put("id", townId);
            record.put("districtId", districtId);
            courtList.add(record);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("请求地址错误");
    }
}
/**
 * Crawls one city: takes the next index from {@code atoI}, fetches the third-level
 * (district) entries of that city, appends each to {@code districtList} with "name",
 * "id" and "cityId", and fetches the towns of every district via getCourtInfo.
 * Any failure is reported on the console and ends the processing of this city.
 *
 * @param courtList    collector for town records
 * @param districtList collector for district records
 * @param cityList     city records; the entry at the index taken from {@code atoI} is crawled
 * @param atoI         shared counter handing out city indexes
 */
private static void getDistrictInfo(
        ConcurrentLinkedQueue<Map<String,Object>> courtList,ConcurrentLinkedQueue<Map<String,Object>> districtList,
        List<Map<String, Object>> cityList, AtomicInteger atoI) {
    // Claim a city index that no other task will receive.
    Map<String, Object> city = cityList.get(atoI.getAndIncrement());
    String endpoint = "https://d.jd.com/area/get?fid="+city.get("cityId");
    try {
        String body = request(endpoint);
        JSONArray districts = JSONArray.parseArray(body);
        for (Object element : districts) {
            JSONObject district = (JSONObject) element;
            String districtName = district.get("name").toString();
            String districtId = district.getString("id").toString();
            Map<String,Object> record = new HashMap<>();
            record.put("name", districtName);
            record.put("id", districtId);
            record.put("cityId", city.get("cityId"));
            districtList.add(record);
            // Descend one level: the towns of this district.
            getCourtInfo(courtList, districtId);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("请求地址错误");
    }
}
/**
 * Opens the given URL and returns the response body with its lines joined together
 * (line terminators are dropped). The body is decoded with the platform default charset.
 *
 * @param url address to fetch
 * @return the concatenated response lines; empty if the response has no content
 * @throws Exception if the URL is malformed or the connection or read fails
 */
private static String request(String url) throws Exception {
    URLConnection connection = new URL(url).openConnection();
    connection.connect();
    StringBuilder content = new StringBuilder();
    // try-with-resources closes the reader and the underlying stream even when reading fails.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
        String line;
        while ((line = in.readLine()) != null) {
            content.append(line);
        }
    }
    return content.toString();
}
/**
 * Parses the decoded city source text into city records.
 * The text is split into lines on the marker "hello,dy". After double quotes are removed,
 * each line is expected to look like {@code provinceId:name|id,name|id,...}.
 * Blank lines, lines without a ':' part and entries without a '|' part are skipped.
 *
 * @param cityStr decoded city text; may be empty or {@code null}
 * @return one map per city with the keys "cityName", "cityId" and "provinceId"
 */
private static List<Map<String,Object>> cityDataHandle(String cityStr){
    List<Map<String,Object>> cityList = new ArrayList<Map<String,Object>>();
    if (cityStr == null) {
        return cityList;
    }
    for (String line : cityStr.split("hello,dy")) {
        String[] parts = line.replace("\"", "").split(":");
        if (parts.length < 2) {
            // Blank or malformed line: no city list to read.
            continue;
        }
        String provinceId = parts[0].replace("\t", "");
        for (String entry : parts[1].split(",")) {
            if (entry.trim().isEmpty()) {
                continue;
            }
            String[] fields = entry.split("\\|");
            if (fields.length < 2) {
                // An entry without an id cannot be crawled later; skip it.
                continue;
            }
            Map<String,Object> one = new HashMap<String, Object>();
            one.put("cityName", fields[0]);
            one.put("cityId", fields[1]);
            one.put("provinceId", provinceId);
            cityList.add(one);
        }
    }
    return cityList;
}
/**
 * Reads the city source file and joins its lines with the marker "hello,dy"
 * (the marker that cityDataHandle splits on). The file is decoded with the platform
 * default charset.
 *
 * @param fileName path of the file to read
 * @return the joined lines; an empty string if the file is empty or cannot be read
 *         (the I/O error is printed, not rethrown)
 */
private static String readFile(String fileName){
    File file = new File(fileName);
    StringBuilder content = new StringBuilder();
    // try-with-resources closes the reader on every path.
    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
        boolean first = true;
        String line;
        while ((line = br.readLine()) != null) {
            // Marker goes between lines only, so nothing has to be cut off at the end.
            if (!first) {
                content.append("hello,dy");
            }
            content.append(line);
            first = false;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return content.toString();
}
/**
 * Parses the decoded province table into province records.
 * Entries are separated by ',' and fields by '|' in the order
 * name, id, code and an optional type.
 *
 * @param provinceStr decoded province table
 * @return one map per province with the keys "provinceName", "provinceId",
 *         "provinceCode" and "provinceType" ("1" = municipality; "2" = province,
 *         used when the entry carries no type field)
 */
private static List<Map<String,Object>> provinceDataHandle(String provinceStr){
    List<Map<String,Object>> provinces = new ArrayList<Map<String,Object>>();
    for (String entry : provinceStr.split(",")) {
        String[] fields = entry.split("\\|");
        Map<String,Object> province = new HashMap<String, Object>();
        province.put("provinceName", fields[0]);
        province.put("provinceId", fields[1]);
        province.put("provinceCode", fields[2]);
        // A fourth field is the explicit type; without it the entry is an ordinary province.
        String type = fields.length > 3 ? fields[3] : "2";
        province.put("provinceType", type);
        provinces.add(province);
    }
    return provinces;
}
/**
 * Decodes textual Unicode escapes: every backslash followed by 'u' and four hexadecimal
 * digits is replaced by the character with that code; all other characters, including
 * the final one and any incomplete escape, are copied unchanged.
 *
 * @param s text that may contain such escapes; must not be {@code null}
 * @return the decoded text
 */
public static String cover(String s){
    StringBuilder decoded = new StringBuilder(s.length());
    int pos = 0;
    while (pos < s.length()) {
        char current = s.charAt(pos);
        if (current == '\\' && isUnicodeEscapeAt(s, pos)) {
            decoded.append((char) Integer.parseInt(s.substring(pos + 2, pos + 6), 16));
            pos += 6;
        } else {
            decoded.append(current);
            pos++;
        }
    }
    return decoded.toString();
}

// True when a complete escape (backslash, 'u', four hex digits) starts at index pos of s.
private static boolean isUnicodeEscapeAt(String s, int pos) {
    if (pos + 6 > s.length() || s.charAt(pos + 1) != 'u') {
        return false;
    }
    for (int k = pos + 2; k < pos + 6; k++) {
        if (Character.digit(s.charAt(k), 16) < 0) {
            return false;
        }
    }
    return true;
}
/**
 * Appends {@code content} to {@code file} while holding an exclusive file lock.
 * The lock is requested once per second until it is granted. I/O errors are printed,
 * not rethrown; an interrupt while waiting abandons the write and restores the
 * thread's interrupt flag.
 *
 * @param content text to append, encoded with the platform default charset
 * @param file    target file; created if it does not exist
 */
public static void writeByNIO(String content,File file) {
    // try-with-resources closes the channel and the file on every path.
    try (RandomAccessFile fout = new RandomAccessFile(file, "rw");
            FileChannel fcout = fout.getChannel()) {
        FileLock lock = null;
        while (lock == null) {
            try {
                // null means another program holds the lock.
                lock = fcout.tryLock();
            } catch (OverlappingFileLockException e) {
                // The lock is held inside this JVM; treated like "not acquired".
            }
            if (lock == null) {
                System.out.print("lock is exist ......");
                Thread.sleep(1000);
            }
        }
        try {
            // Find the end of the file only after the lock is held, so data appended
            // by another writer in the meantime is not overwritten.
            fout.seek(fout.length());
            fout.write(content.getBytes());
        } finally {
            lock.release();
        }
    } catch (IOException e1) {
        e1.printStackTrace();
        System.out.print("file no find ...");
    } catch (InterruptedException e) {
        // Keep the interrupt visible to the caller.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}
}
以上代码能直接爬取京东的全国地址数据并保存到本地数据库中。使用时请注意修改数据库连接配置,并准备好对应的表结构。