import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Properties;
import java.util.Set;
import java.util.Vector;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import com.fish.framework.constant.Charsets;
import com.fish.framework.constant.ProperKeys;
import com.fish.util.ReadToWriteIO;
import com.fish.util.StringUtil;
public class GetHtml {
// Worker pool that runs the GetDetailThread tasks; created in the static initializer with
// threadPool threads and replaced by executionDetail when a batch does not drain in time.
private static ExecutorService executorService = null;
// Permit counter created with threadPool permits; each worker acquires one permit while it runs
// and executionDetail polls availablePermits() to decide whether a batch has finished.
private static Semaphore semaphore = null;
// Shared HTTP connection pool; closed and re-created by executionDetail when the client is recycled.
private static PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
// Network monitor; its constructor starts the monitoring thread as soon as this class is loaded.
private static NetworkManagement nm = new NetworkManagement();
// Size of the worker pool and number of semaphore permits.
private static int threadPool = 100;
// Value passed to cm.setMaxTotal(...).
private static int maxTotal = 200;
// Reset to 0 at the start of every batch and incremented (without synchronization) by each worker
// that parses a quotation; executionDetail only compares it against 0.
private static int successCount = 0;
// Number of symbols submitted per batch.
private static int tableEverySize = 100;
// Symbol -> collected quotation row; filled by the workers and flushed to disk by writeBaseDataToFile.
private static ConcurrentHashMap<String, DetailData> baseDataList = new ConcurrentHashMap<String, DetailData>();
// Regex compiled from the ProperKeys.REGEX_STOCK_QUOTATION property; group 1 is the raw quotation string.
private static Pattern tablePattern;
// Value of the ProperKeys.GET_STOCK_QUOTATION_PATH property; the symbol is appended to it to form the request URL.
private static String get_stock_quotation_path;
/**
* 加载配置文件
*/
static {
Properties proper = new Properties();
try {
proper.load(new FileInputStream(Thread.currentThread().getContextClassLoader().getResource("proper.properties").getPath()));
tablePattern = Pattern.compile(proper.getProperty(ProperKeys.REGEX_STOCK_QUOTATION));
get_stock_quotation_path = proper.getProperty(ProperKeys.GET_STOCK_QUOTATION_PATH);
executorService = Executors.newFixedThreadPool(threadPool);//创建线程
semaphore = new Semaphore(threadPool);//用来控制同时访问特定资源的线程数量
nm.isSpontaneousNotice(false);
} catch (Exception e) {
throw new RuntimeException("properties load fail!");
}
}
/**
 * Program entry point: runs one complete crawl.
 *
 * @param args ignored
 * @throws InterruptedException if the crawl is interrupted
 * @throws IOException if an HTTP client cannot be closed during the crawl
 */
public static void main(String[] args) throws InterruptedException, IOException {
    GetHtml.execution();
}
/**
 * Configures the shared connection pool, builds the HTTP client, runs
 * {@link #executionDetail(CloseableHttpClient)} and reports the elapsed time in seconds.
 * The worker pool, the connection manager and the client are shut down afterwards, whether
 * or not the crawl succeeded.
 *
 * @throws InterruptedException if the crawl is interrupted
 * @throws IOException if an HTTP client cannot be closed during the crawl
 */
private static void execution() throws InterruptedException, IOException {
    CloseableHttpClient client = null;
    try {
        cm.setMaxTotal(maxTotal);
        cm.setDefaultMaxPerRoute(20);
        client = HttpClients.custom().setConnectionManager(cm).build();

        final long startedAt = System.currentTimeMillis();
        executionDetail(client);
        final long elapsedSeconds = (System.currentTimeMillis() - startedAt) / 1000;
        System.out.println(tableEverySize + "条线程执行时间共:" + elapsedSeconds + "秒\t");
    } finally {
        executorService.shutdown();
        cm.close();
        if (client != null) {
            try {
                client.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}
/**
 * Performs the initial crawl of the detailed quotation data.
 *
 * <p>Reads the stock symbols from {@code symbol_type.csv}, splits them into batches of
 * {@code tableEverySize}, submits one {@link GetDetailThread} per non-empty symbol to the
 * worker pool, waits for the batch to drain and appends the collected rows to the output
 * files whenever the number of processed symbols is a multiple of 500 and once more at the end.
 *
 * @param httpclient client used for the requests; it may be closed and replaced by a freshly
 *                   built client while the crawl is running
 * @throws InterruptedException if the calling thread is interrupted during one of the pauses
 *                              that are not individually guarded
 * @throws IOException if closing the client fails while it is being recycled
 */
public static void executionDetail(CloseableHttpClient httpclient)
        throws InterruptedException, IOException {
    File file = new File("E:\\test\\html\\stock\\symbol_type.csv");
    Vector<String> vList = ReadToWriteIO.readStockSymbolToFile(file, Charsets.UTF8);
    int vListSize = vList.size();
    System.out.println(vListSize + " " + tableEverySize);
    // Number of batches: ceiling of vListSize / tableEverySize.
    int len = (vListSize + tableEverySize - 1) / tableEverySize;
    int dataCount = 0;
    System.out.println("共" + vListSize + "条数据,将分" + len + "批经行获取,开始抓取....");
    for (int i = 0; i < len; i++) {
        successCount = 0;
        // Recycle the client and its pool every 100 batches.
        if ((i + 1) % 100 == 0) {
            httpclient = rebuildHttpClient(httpclient);
        }
        cm.closeExpiredConnections();
        System.out.println("\t\t发送第" + (i + 1) + "批次content请求");
        Thread.sleep(500);

        // The last batch may be shorter than tableEverySize.
        int start = i * tableEverySize;
        int batchSize = Math.min(tableEverySize, vListSize - start);
        for (int j = 0; j < batchSize; j++) {
            dataCount++;
            String symbol_type = vList.get(start + j);
            // Compare by content: "!=" on strings only compares references.
            if (symbol_type != null && !symbol_type.isEmpty()) {
                baseDataList.put(symbol_type, new DetailData(symbol_type));
                executorService.execute(new GetDetailThread(httpclient,
                        new HttpGet(get_stock_quotation_path + symbol_type),
                        symbol_type));
                try {
                    // Small pause between submissions to throttle the request rate.
                    Thread.sleep(50);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }

        awaitBatchCompletion();

        // Not a single success in this batch: rebuild the client and back off.
        if (successCount == 0) {
            try {
                System.out.println("&&&&& 重新实例化HttpClient &&&&&");
                httpclient = rebuildHttpClient(httpclient);
                Thread.sleep(10000);
            } catch (Exception e) {
                System.out.println("&&&&& catch 正在重新实例化.... &&&&&");
            }
        }
        System.out.println("---- sp.availablePermits():"
                + semaphore.availablePermits());
        System.out.println("\t\t结束第" + (i + 1) + "批次content请求");
        if (dataCount % 500 == 0) {
            try {
                Thread.sleep(2000);
                writeBaseDataToFile(baseDataList);
            } catch (Exception e) {
                e.printStackTrace();
            }
            baseDataList.clear();
        }
    }
    try {
        Thread.sleep(5000);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
    writeBaseDataToFile(baseDataList);
    baseDataList.clear();
}

/**
 * Closes the shared connection manager and {@code oldClient}, then builds a new connection
 * manager with the configured pool limits and returns a client bound to it.
 */
private static CloseableHttpClient rebuildHttpClient(CloseableHttpClient oldClient)
        throws IOException {
    cm.close();
    oldClient.close();
    cm = new PoolingHttpClientConnectionManager();
    // Re-apply the limits; a fresh manager otherwise falls back to the library defaults.
    cm.setMaxTotal(maxTotal);
    cm.setDefaultMaxPerRoute(20);
    return HttpClients.custom().setConnectionManager(cm).build();
}

/**
 * Polls once per second until every semaphore permit is available again. On the tenth poll
 * the worker pool and the semaphore are discarded and re-created instead of waiting further.
 */
private static void awaitBatchCompletion() throws InterruptedException {
    int time = 0;
    // Lock kept from the original code; writeToFile (static synchronized) uses the same monitor.
    synchronized (GetHtml.class) {
        do {
            if (++time == 10) {
                executorService.shutdownNow();
                executorService = Executors.newFixedThreadPool(threadPool);
                semaphore.drainPermits();
                semaphore = new Semaphore(threadPool);
                Thread.sleep(2000);
                break;
            }
            if (semaphore.availablePermits() != threadPool) {
                try {
                    Thread.sleep(1000);
                    System.out.println("\t\t已等待" + time + "秒,已获取有效许可"
                            + semaphore.availablePermits() + "个");
                } catch (Exception e) {
                    e.printStackTrace();
                }
            } else {
                break;
            }
        } while (true);
    }
}
/**
 * Makes sure the output directory and the two result files ({@code okquotation.csv} and
 * {@code noquotation.csv}) exist, then hands the collected data to
 * {@link #writeToFile(ConcurrentHashMap, File, File, String)}.
 *
 * @param baseDataList2 collected quotation data keyed by symbol
 */
private static void writeBaseDataToFile(
        ConcurrentHashMap<String, DetailData> baseDataList2) {
    final String outputDirPath = "E:\\test\\html\\stock\\test2";
    final File outputDir = new File(outputDirPath);
    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }

    final File okFile = new File(outputDirPath + "/" + "okquotation.csv");
    final File noFile = new File(outputDirPath + "/" + "noquotation.csv");
    // Create whichever result file is still missing; a failure is only reported.
    for (File target : new File[] { okFile, noFile }) {
        if (target.exists()) {
            continue;
        }
        try {
            target.createNewFile();
        } catch (IOException creationFailure) {
            creationFailure.printStackTrace();
        }
    }

    writeToFile(baseDataList2, okFile, noFile, Charsets.UTF8);
}
/**
 * Appends the collected quotation data to the result files.
 *
 * <p>An entry whose value is non-empty is written as-is to {@code okFile}; for every other
 * entry the key is written on its own line to {@code noFile}. I/O errors are reported on
 * standard error and do not propagate.
 *
 * @param bdList   collected data keyed by symbol
 * @param okFile   file receiving the rows of successfully fetched symbols (opened in append mode)
 * @param noFile   file receiving the symbols without data (opened in append mode)
 * @param encoding charset name used for both files
 */
public synchronized static void writeToFile(
        ConcurrentHashMap<String, DetailData> bdList, File okFile,
        File noFile, String encoding) {
    BufferedWriter bufOk = null;
    BufferedWriter bufNo = null;
    try {
        bufOk = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(okFile, true), encoding));
        bufNo = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(noFile, true), encoding));
        // Lock kept from the original code. ConcurrentHashMap does not use this monitor for
        // its own operations, so it only excludes code that explicitly synchronizes on the map.
        synchronized (bdList) {
            for (java.util.Map.Entry<String, DetailData> entry : bdList.entrySet()) {
                String row = entry.getValue().getValue();
                // Compare by content; "!=" on strings only compares references.
                if (row != null && !row.isEmpty()) {
                    bufOk.write(row);
                } else {
                    bufNo.write(entry.getKey());
                    bufNo.newLine();
                }
            }
        }
        bufOk.flush();
        bufNo.flush();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Single place where both writers are closed, on success and on failure alike.
        if (bufOk != null) {
            try {
                bufOk.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (bufNo != null) {
            try {
                bufNo.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Worker task that downloads the quotation page of one symbol, extracts the quotation string
 * with {@code tablePattern} and stores the resulting CSV row in {@code baseDataList}.
 *
 * <p>Each task holds one permit of the shared semaphore for as long as it runs; the permit is
 * always returned, also when the request fails.
 */
static class GetDetailThread implements Runnable {
    private final CloseableHttpClient httpClient;
    private final HttpContext context;
    private final HttpGet httpget;
    private final String id;

    /**
     * @param httpClient client used to execute the request
     * @param httpget    prepared request for the symbol's quotation page
     * @param id         symbol; also the key of the entry updated in {@code baseDataList}
     */
    public GetDetailThread(CloseableHttpClient httpClient, HttpGet httpget,
            String id) {
        this.httpClient = httpClient;
        this.context = new BasicHttpContext();
        this.httpget = httpget;
        this.id = id;
    }

    /**
     * Executes the request and, for a 2xx response with a body, parses and stores the result.
     * Failures are reported on standard output and never propagate.
     */
    @Override
    public void run() {
        // Remember the instance the permit is taken from: the static field can be replaced
        // while this task is running, and the permit must go back to where it came from.
        final Semaphore permits = semaphore;
        try {
            permits.acquire();
        } catch (InterruptedException e) {
            // No permit was obtained, so none may be released; the task is abandoned.
            Thread.currentThread().interrupt();
            System.out.println(id + " - error: " + e);
            return;
        }
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpget, context);
            int status = response.getStatusLine().getStatusCode();
            if (status >= 200 && status < 300) {
                HttpEntity httpEntity = response.getEntity();
                if (httpEntity != null) {
                    parseAndStore(httpEntity);
                }
            }
        } catch (Exception e) {
            this.httpget.abort();
            System.out.println(id + " - error: " + e);
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            httpget.releaseConnection();
            // Released on every path so a failed request cannot leak a permit.
            permits.release();
        }
    }

    /** Reads the response body, extracts the quotation string and stores the CSV row for this symbol. */
    private void parseAndStore(HttpEntity httpEntity) throws IOException {
        String cont = trimLineToString(httpEntity, "utf-8");
        System.out.println("-------html-------" + cont);
        EntityUtils.consume(httpEntity);
        Matcher matcher = tablePattern.matcher(cont);
        if (matcher.find()) {
            String[] strs = StringUtil.split(matcher.group(1), ",");
            String row = buildRow(strs);
            // Plain, unsynchronized increment; the main loop only compares the counter with 0.
            successCount++;
            baseDataList.get(id).setValue(row);
            System.out.println(" id:" + id + " " + "\t抓取成功");
        } else {
            System.out.println(" id:" + id + " "
                    + "\t抓取失败,响应长度:"
                    + httpEntity.getContentLength());
        }
    }

    /**
     * Builds the CSV row for this symbol: the id followed by the selected quotation fields,
     * every value double-quoted, separated by commas and terminated by CRLF.
     */
    private String buildRow(String[] strs) {
        String[] columns = {
            strs[1],
            // Only the part of field 28 before the first space is kept.
            (strs[28].split(" "))[0],
            strs[5], strs[10], strs[11], strs[13], strs[9], strs[8],
            strs[3], strs[4], strs[6], strs[7], strs[23], strs[22], strs[24]
        };
        StringBuilder row = new StringBuilder();
        row.append('"').append(id).append('"');
        for (String column : columns) {
            row.append(",\"").append(column).append('"');
        }
        row.append("\r\n");
        return row.toString();
    }

    /**
     * Reads the entity content line by line and concatenates the trimmed lines, skipping
     * blank lines and lines that contain {@code <!--}. I/O errors are reported and the text
     * read so far is returned.
     *
     * @param entiry  entity whose content is read
     * @param charset charset name used to decode the content
     * @return the concatenated text, possibly empty
     */
    public synchronized String trimLineToString(HttpEntity entiry,
            String charset) {
        StringBuilder sb = new StringBuilder();
        InputStream instream = null;
        BufferedReader reader = null;
        try {
            instream = entiry.getContent();
            reader = new BufferedReader(new InputStreamReader(instream,
                    charset));
            String str;
            while ((str = reader.readLine()) != null) {
                String trimmed = str.trim();
                if (trimmed.length() != 0 && !trimmed.contains("<!--")) {
                    sb.append(trimmed);
                }
            }
        } catch (IllegalStateException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (reader != null) {
                    // Closing the reader also closes the underlying stream.
                    reader.close();
                } else if (instream != null) {
                    // Reader construction failed (e.g. unsupported charset): close the stream directly.
                    instream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return sb.toString();
    }
}
}
package com.fish.net;
public class DetailData {
private String key;
private String value = "";
public DetailData() {
super();
}
public DetailData(String key) {
super();
this.key = key;
}
public DetailData(String key, String value) {
super();
this.key = key;
this.value = value;
}
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((key == null) ? 0 : key.hashCode());
result = prime * result + ((value == null) ? 0 : value.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
DetailData other = (DetailData) obj;
if (key == null) {
if (other.key != null)
return false;
} else if (!key.equals(other.key))
return false;
if (value == null) {
if (other.value != null)
return false;
} else if (!value.equals(other.value))
return false;
return true;
}
}
package com.fish.net;
import java.awt.Toolkit;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Background monitor that checks whether the network is reachable.
 *
 * <p>The constructor starts a thread that repeatedly probes the configured URLs and publishes
 * the latest result through {@link #IsNetWordLinking()}. The configuration setters may be
 * called from other threads while the monitor is running.
 */
public class NetworkManagement implements Runnable {
    // Configuration is written by caller threads and read by the monitor thread, hence volatile.
    private volatile int htmlCodeSize;
    private volatile int sleepMillisecond;
    private volatile int sleepMillisecondWhenNetWorkUnLinked;
    private volatile boolean isSpontaneousNotice;
    // Latest probe result; written by the monitor thread, read by any thread.
    private static volatile boolean networkIsLinked;
    private Thread thread = new Thread(this);
    private Toolkit toolkit;
    private volatile String[] urls;

    /** Creates the monitor with its default settings and starts the monitoring thread. */
    public NetworkManagement() {
        this.urls = new String[] { "http://www.baidu.com",
                "http://www.google.cn" };
        this.htmlCodeSize = 50;
        this.sleepMillisecond = 5000;
        this.sleepMillisecondWhenNetWorkUnLinked = 5000;
        this.toolkit = Toolkit.getDefaultToolkit();
        thread.start();
    }

    /** Replaces the probed URLs; {@code null} or empty arrays are ignored. */
    public void setURLs(String[] urls) {
        if (urls != null && urls.length > 0) {
            this.urls = urls;
        }
    }

    /** Sets how many bytes are requested from a probed URL; non-positive values are ignored. */
    public void setHtmlCodeSize(int htmlCodeSize) {
        if (htmlCodeSize > 0) {
            this.htmlCodeSize = htmlCodeSize;
        }
    }

    /** Enables or disables the beep and console message emitted when the network is down. */
    public void isSpontaneousNotice(boolean isSpontaneousNotice) {
        this.isSpontaneousNotice = isSpontaneousNotice;
    }

    /** Sets the pause between two checks in milliseconds; values of 100 or less are ignored. */
    public void setSleepMillisecont(int sleepMillisecont) {
        if (sleepMillisecont > 100) {
            this.sleepMillisecond = sleepMillisecont;
        }
    }

    /**
     * Sets the additional pause applied after a failed check, in milliseconds; values of 100
     * or less are ignored.
     */
    public void setSleepMillisecondWhenNetWorkUnLinked(int sleepMillisecont) {
        if (sleepMillisecont > 100) {
            this.sleepMillisecondWhenNetWorkUnLinked = sleepMillisecont;
        }
    }

    /** @return the result of the most recent check; {@code false} until a check has succeeded */
    public static boolean IsNetWordLinking() {
        return NetworkManagement.networkIsLinked;
    }

    /** Monitoring loop: check, optionally notify and pause; ends when the thread is interrupted. */
    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                this.isNetWorkLinked();
                if (!NetworkManagement.networkIsLinked) {
                    this.isPrintMessage(this.isSpontaneousNotice);
                    Thread.sleep(this.sleepMillisecondWhenNetWorkUnLinked);
                }
                Thread.sleep(this.sleepMillisecond);
            } catch (InterruptedException e) {
                // Restore the flag so the loop condition ends the thread.
                Thread.currentThread().interrupt();
            } catch (RuntimeException e) {
                // Keep monitoring, but do not hide the failure.
                e.printStackTrace();
            }
        }
    }

    /**
     * Probes one URL.
     *
     * @return {@code true} if at least one byte of the response body could be read;
     *         {@code false} on any failure or an empty body
     */
    private boolean canGetHtmlCode(String httpUrl) {
        HttpURLConnection connection = null;
        InputStream in = null;
        try {
            URL url = new URL(httpUrl);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestProperty("User-Agent", "Mozilla/4.0");
            connection.connect();
            in = connection.getInputStream();
            byte[] buffer = new byte[this.htmlCodeSize];
            // Judge by the number of bytes actually read: converting the fixed-size buffer to
            // a string is never empty and would report success even for an empty response.
            return in.read(buffer) > 0;
        } catch (Exception e) {
            // Any failure (bad URL, DNS, connect, read) simply means "not reachable".
            return false;
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (java.io.IOException ignored) {
                    // Nothing useful can be done if closing the probe stream fails.
                }
            }
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /** Probes the URLs in order and records whether any of them answered. */
    private void isNetWorkLinked() {
        boolean tempIsNetWorkLinked = false;
        String[] targets = this.urls;
        for (int urlsCount = 0; urlsCount < targets.length; urlsCount++) {
            if (this.canGetHtmlCode(targets[urlsCount])) {
                tempIsNetWorkLinked = true;
                break;
            }
        }
        NetworkManagement.networkIsLinked = tempIsNetWorkLinked;
    }

    /** Beeps and prints the "network down" message when {@code isPrint} is set. */
    private void isPrintMessage(boolean isPrint) {
        if (isPrint) {
            toolkit.beep();
            StringBuilder message = new StringBuilder();
            message.append("------------->");
            message.append("网络中断, ");
            message.append(this.sleepMillisecondWhenNetWorkUnLinked);
            message.append(" 毫秒后再次检测!<-------------");
            System.out.println(message.toString());
        }
    }

    /** Starts a monitor with notifications disabled. */
    public static void main(String[] args) {
        NetworkManagement n = new NetworkManagement();
        n.isSpontaneousNotice(false);
    }
}