package com.zhanzhang.tools;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public class 去广告链接 {
final static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
public static String parse302URL(String url) {
String location = null;
try {
URL serverUrl = new URL(url);
HttpURLConnection conn = (HttpURLConnection) serverUrl
.openConnection();
conn.setRequestMethod("GET");
conn.setConnectTimeout(10000);
conn.setReadTimeout(10000);
// 必须设置false,否则会自动redirect到Location的地址
conn.setInstanceFollowRedirects(false);
conn.addRequestProperty("Accept-Charset", "UTF-8;");
conn.addRequestProperty(
"User-Agent",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
conn.addRequestProperty("Referer", "http://www.zuidaima.com/");
conn.connect();
location = conn.getHeaderField("Location");
conn.disconnect();
} catch (Exception e) {
e.printStackTrace();
}
return location;
}
public static String request(String url) {
StringBuffer res = new StringBuffer();
HttpURLConnection conn = null;
try {
URL serverUrl = new URL(url);
conn = (HttpURLConnection) serverUrl.openConnection();
conn.setRequestMethod("GET");
conn.setConnectTimeout(10000);
conn.setReadTimeout(10000);
conn.addRequestProperty("Accept-Charset", "UTF-8;");
conn.addRequestProperty(
"User-Agent",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
conn.addRequestProperty("Referer", "http://www.zuidaima.com/");
conn.connect();
InputStream ins = conn.getInputStream();
String charset = "UTF-8";
InputStreamReader inr = new InputStreamReader(ins, charset);
BufferedReader bfr = new BufferedReader(inr);
String line = "";
do {
res.append(line);
line = bfr.readLine();
} while (line != null);
inr.close();
bfr.close();
} catch (Exception e) {
System.out.println("###error:" + e.getMessage() + " at "
+ new Date());
if (e.toString().indexOf("FileNotFound") != -1) {
res.append("404");
}
} finally {
if (conn != null) {
conn.disconnect();
}
}
return res.toString();
}
public static List parseWebNavs(String html) {
List webs = new ArrayList();
Pattern pattern = Pattern
.compile("
([\\s\\S]*?)");
Matcher matcher = pattern.matcher(html);
while (matcher.find()) {
String _url = matcher.group(1);
String name = matcher.group(2);
name = name.replaceAll("<.>", " ").trim();
String url = parse302URL(_url);
if (url == null) {
url = _url;
}
if (webs.contains(url)) {
continue;
}
Web web = new Web(url, name);
webs.add(web);
}
return webs;
}
public static List parseURL(String keyword) throws IOException {
String _keyword = URLEncoder.encode("site:zuidaima.com " + keyword,
"utf-8");
List webs = new ArrayList();
int p = 1;
int s = 10;
while (true) {
int pn = (p - 1) * s;
String url = "http://www.baidu.com/s?wd=%s&pn=%s&ie=utf-8&usm=1&rsv_page=1";
System.out.println("Start to parse " + keyword + " " + p);
String _url = String.format(url, _keyword, pn + "");
System.out.println(p + " Request url " + _url);
String html = request(_url);
List _webs = parseWebNavs(html);
for (Web web : _webs) {
if (!request(web.getHome()).equals("404")) {
System.out.println("过滤掉:" + web.getName());
continue;
}
if (webs.contains(web)) {
continue;
}
webs.add(web);
}
if (html.indexOf("下一页") == -1) {
break;
}
p++;
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
return webs;
}
public static Element createUrlElement(Document document, String loc,
String priority, String lastmod, String changefreq) {
Element element = document.createElement("url");
Element locElement = document.createElement("loc");
locElement.appendChild(document.createTextNode(loc));
element.appendChild(locElement);
Element priorityElement = document.createElement("priority");
priorityElement.appendChild(document.createTextNode(priority));
element.appendChild(priorityElement);
Element lastmodElement = document.createElement("lastmod");
lastmodElement.appendChild(document.createTextNode(lastmod));
element.appendChild(lastmodElement);
Element changefreqElement = document.createElement("changefreq");
changefreqElement.appendChild(document.createTextNode(changefreq));
element.appendChild(changefreqElement);
return element;
}
public static void main(String[] args) throws Exception {
List webs = new ArrayList();
String keywords = "学校,学院,办理,毕业证";
String[] _keywords = keywords.split(",");
for (String keyword : _keywords) {
List _webs = parseURL(keyword);
webs.addAll(_webs);
}
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
Document document = builder.newDocument();
Element root = document.createElement("urlset");
document.appendChild(root);
Date now = new Date();
String loc = null;
String priority = null;
String lastmod = null;
String changefreq = null;
for (Web web : webs) {
loc = web.getHome();
priority = "0.8";
lastmod = sdf.format(now);
changefreq = "daily";
Element shareElement = createUrlElement(document, loc, priority,
lastmod, changefreq);
root.appendChild(shareElement);
}
File file = new File("c:/sitemap_trash.xml");
if (!file.getParentFile().exists()) {
file.mkdirs();
}
if (file.exists()) {
file.delete();
} else {
file.createNewFile();
}
TransformerFactory tf = TransformerFactory.newInstance();
Transformer transformer = tf.newTransformer();
DOMSource source = new DOMSource(document);
transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
PrintWriter pw = new PrintWriter(new FileOutputStream(file));
StreamResult result = new StreamResult(pw);
transformer.transform(source, result);
//String html = request("http://www.zuidaima.com/share/k%E5%8A%9E%E7%90%86%E4%B8%8A%E6%B5%B7%E4%B8%AD%E4%BE%A8%E8%81%8C%E4%B8%9A%E6%8A%80%E6%9C%AF%E5%AD%A6%E9%99%A2%E6%96%87%E5%87%AD%E3%80%90%E8%81%94%E7%B3%BBQQ%EF%BC%9A931957539%E3%80%91%E5%8A%9E%E7%90%86%E4%B8%8A%E6%B5%B7%E4%B8%AD%E4%BE%A8%E8%81%8C%E4%B8%9A%E6%8A%80%E6%9C%AF%E5%AD%A6%E9%99%A2%E6%96%87%E5%87%AD%E2%98%852014%E5%B9%B412%E6%9C%8829%E6%97%A5roab9f-p1-s1.htm");
//System.out.println(html);
}
}