html广告恶意植入,最代码网站的链接被垃圾广告链接恶意提交到百度收录后的经验和代码片段分享...

package com.zhanzhang.tools;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.PrintWriter;

import java.net.HttpURLConnection;

import java.net.URL;

import java.net.URLEncoder;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Date;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;

import javax.xml.parsers.DocumentBuilderFactory;

import javax.xml.transform.OutputKeys;

import javax.xml.transform.Transformer;

import javax.xml.transform.TransformerFactory;

import javax.xml.transform.dom.DOMSource;

import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;

import org.w3c.dom.Element;

public class 去广告链接 {

// Date formatter (yyyy-MM-dd) that main() uses to produce the <lastmod> value of each sitemap entry.
final static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");

/**
 * Sends a GET request to {@code url} without following redirects and returns
 * the {@code Location} header of the response.
 *
 * <p>{@code parseWebNavs} calls this to turn a matched link into the address
 * it redirects to.
 *
 * @param url the address to probe
 * @return the value of the {@code Location} response header, or {@code null}
 *         when the response has no such header or the request fails
 */
public static String parse302URL(String url) {
    String location = null;
    HttpURLConnection conn = null;
    try {
        URL serverUrl = new URL(url);
        conn = (HttpURLConnection) serverUrl.openConnection();
        conn.setRequestMethod("GET");
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(10000);
        // Must be false, otherwise the connection automatically follows the
        // redirect to the Location address and the header is never seen.
        conn.setInstanceFollowRedirects(false);
        conn.addRequestProperty("Accept-Charset", "UTF-8;");
        conn.addRequestProperty(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
        conn.addRequestProperty("Referer", "http://www.zuidaima.com/");
        conn.connect();
        location = conn.getHeaderField("Location");
    } catch (Exception e) {
        // Best effort: a failed probe is reported and the caller gets null.
        e.printStackTrace();
    } finally {
        // Release the connection on the failure path as well as on success.
        if (conn != null) {
            conn.disconnect();
        }
    }
    return location;
}

/**
 * Fetches {@code url} with a GET request and returns the response body
 * decoded as UTF-8.
 *
 * <p>The body is read line by line and the lines are concatenated without
 * line separators. Errors are reported on standard output and never thrown.
 *
 * @param url the address to fetch
 * @return the concatenated response body; the literal string {@code "404"}
 *         when the connection raises {@link java.io.FileNotFoundException};
 *         otherwise whatever was read before a failure (possibly empty)
 */
public static String request(String url) {
    StringBuilder res = new StringBuilder();
    HttpURLConnection conn = null;
    BufferedReader reader = null;
    try {
        URL serverUrl = new URL(url);
        conn = (HttpURLConnection) serverUrl.openConnection();
        conn.setRequestMethod("GET");
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(10000);
        conn.addRequestProperty("Accept-Charset", "UTF-8;");
        conn.addRequestProperty(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
        conn.addRequestProperty("Referer", "http://www.zuidaima.com/");
        conn.connect();

        InputStream ins = conn.getInputStream();
        reader = new BufferedReader(new InputStreamReader(ins, "UTF-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            res.append(line);
        }
    } catch (Exception e) {
        System.out.println("###error:" + e.getMessage() + " at "
                + new Date());
        // Callers compare the result with "404" to detect dead pages, so
        // make sure nothing else is in the buffer when reporting it.
        if (e instanceof java.io.FileNotFoundException) {
            res.setLength(0);
            res.append("404");
        }
    } finally {
        // Close the reader on every path; it was previously leaked when
        // reading failed part-way through.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
        if (conn != null) {
            conn.disconnect();
        }
    }
    return res.toString();
}

/**
 * Scans {@code html} with a regular expression and collects one {@code Web}
 * entry per match.
 *
 * <p>For every match, group 1 is taken as a link and group 2 as its display
 * name. The link is passed to {@code parse302URL}; when that returns
 * {@code null} the matched link itself is used.
 *
 * @param html the page markup to scan
 * @return the list of {@code Web} objects built from the matches
 */
public static List parseWebNavs(String html) {

List webs = new ArrayList();

// NOTE(review): the pattern literal below looks truncated (its HTML-tag part
// was presumably stripped when this code was extracted from a web page). As
// shown it is split across lines, which is not a valid Java string literal,
// and it contains only one capturing group although group(2) is read below.
// Restore the full pattern from the original source before using this method.
Pattern pattern = Pattern

.compile("

([\\s\\S]*?)");

Matcher matcher = pattern.matcher(html);

while (matcher.find()) {

String _url = matcher.group(1);

String name = matcher.group(2);

// NOTE(review): "<.>" only matches a '<', one character and a '>' (e.g. "<b>");
// presumably meant to strip markup from the name -- confirm the intended regex.
name = name.replaceAll("<.>", " ").trim();

// Resolve the redirect target; fall back to the matched link when there is none.
String url = parse302URL(_url);

if (url == null) {

url = _url;

}

// NOTE(review): webs holds Web objects while url is a String, so this check
// presumably never matches unless Web.equals accepts strings -- verify
// against the Web class, which is not visible here.
if (webs.contains(url)) {

continue;

}

Web web = new Web(url, name);

webs.add(web);

}

return webs;

}

/**
 * Pages through Baidu results for {@code site:zuidaima.com <keyword>} and
 * collects the result entries whose target page answers {@code "404"}.
 *
 * <p>Each result page is fetched with {@code request}, parsed with
 * {@code parseWebNavs}, and every entry whose page does not return
 * {@code "404"} is filtered out. Paging stops when a page no longer contains
 * the "next page" marker, or when the thread is interrupted while waiting
 * between pages.
 *
 * @param keyword the search keyword appended to the site restriction
 * @return the entries whose pages returned {@code "404"}, without repeats as
 *         far as {@code Web.equals} (defined elsewhere) recognises them
 * @throws IOException if the keyword cannot be URL-encoded
 */
public static List<Web> parseURL(String keyword) throws IOException {
    String encodedKeyword = URLEncoder.encode("site:zuidaima.com " + keyword,
            "utf-8");
    List<Web> webs = new ArrayList<Web>();
    final int pageSize = 10;
    int page = 1;

    while (true) {
        // Baidu's pn parameter is the zero-based offset of the first result.
        int pn = (page - 1) * pageSize;
        String url = "http://www.baidu.com/s?wd=%s&pn=%s&ie=utf-8&usm=1&rsv_page=1";
        System.out.println("Start to parse " + keyword + " " + page);
        String pageUrl = String.format(url, encodedKeyword, pn);
        System.out.println(page + " Request url " + pageUrl);

        String html = request(pageUrl);
        // parseWebNavs only ever adds Web instances to the list it returns.
        @SuppressWarnings("unchecked")
        List<Web> pageWebs = parseWebNavs(html);

        for (Web web : pageWebs) {
            // Only dead pages are wanted; anything still reachable is dropped.
            if (!request(web.getHome()).equals("404")) {
                System.out.println("过滤掉:" + web.getName());
                continue;
            }
            if (webs.contains(web)) {
                continue;
            }
            webs.add(web);
        }

        if (html.indexOf("下一页") == -1) {
            break;
        }
        page++;

        try {
            // Pause between result pages.
            Thread.sleep(3000);
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller and stop paging instead
            // of silently continuing without the delay.
            Thread.currentThread().interrupt();
            break;
        }
    }
    return webs;
}

/**
 * Builds a {@code <url>} element with four text-only children, in this
 * order: {@code <loc>}, {@code <priority>}, {@code <lastmod>} and
 * {@code <changefreq>}.
 *
 * @param document   the document used to create the nodes
 * @param loc        text of the {@code <loc>} child
 * @param priority   text of the {@code <priority>} child
 * @param lastmod    text of the {@code <lastmod>} child
 * @param changefreq text of the {@code <changefreq>} child
 * @return the new {@code <url>} element; it is not attached to the document
 */
public static Element createUrlElement(Document document, String loc,
        String priority, String lastmod, String changefreq) {
    final String[] tagNames = { "loc", "priority", "lastmod", "changefreq" };
    final String[] texts = { loc, priority, lastmod, changefreq };

    Element urlElement = document.createElement("url");
    for (int i = 0; i < tagNames.length; i++) {
        Element child = document.createElement(tagNames[i]);
        child.appendChild(document.createTextNode(texts[i]));
        urlElement.appendChild(child);
    }
    return urlElement;
}

/**
 * Collects the dead pages found for a fixed set of keywords and writes them
 * as a {@code <urlset>} XML document.
 *
 * <p>Every keyword is passed to {@code parseURL}; each distinct page address
 * becomes one {@code <url>} entry with priority {@code 0.8}, today's date as
 * {@code lastmod} and change frequency {@code daily}.
 *
 * @param args optional; {@code args[0]} overrides the output file path, which
 *             defaults to {@code c:/sitemap_trash.xml}
 * @throws Exception if the search, XML construction or file output fails
 */
public static void main(String[] args) throws Exception {
    String outputPath = (args != null && args.length > 0)
            ? args[0]
            : "c:/sitemap_trash.xml";

    List<Web> webs = new ArrayList<Web>();
    String keywords = "学校,学院,办理,毕业证";
    for (String keyword : keywords.split(",")) {
        // parseURL only ever returns Web instances.
        @SuppressWarnings("unchecked")
        List<Web> found = parseURL(keyword);
        webs.addAll(found);
    }

    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    DocumentBuilder builder = factory.newDocumentBuilder();
    Document document = builder.newDocument();
    Element root = document.createElement("urlset");
    document.appendChild(root);

    String lastmod = sdf.format(new Date());
    // The same page can be found through several keywords; list it only once.
    List<String> writtenLocs = new ArrayList<String>();
    for (Web web : webs) {
        String loc = web.getHome();
        if (loc == null || writtenLocs.contains(loc)) {
            continue;
        }
        writtenLocs.add(loc);
        root.appendChild(createUrlElement(document, loc, "0.8", lastmod,
                "daily"));
    }

    File file = new File(outputPath);
    // Create the missing parent directory, not a directory named like the file.
    File parent = file.getParentFile();
    if (parent != null && !parent.exists()) {
        parent.mkdirs();
    }

    Transformer transformer = TransformerFactory.newInstance().newTransformer();
    transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
    transformer.setOutputProperty(OutputKeys.INDENT, "yes");

    // Hand the transformer a byte stream so it applies the declared utf-8
    // encoding itself, and close the stream so the output is fully written.
    // FileOutputStream creates the file or truncates an existing one.
    FileOutputStream out = new FileOutputStream(file);
    try {
        transformer.transform(new DOMSource(document), new StreamResult(out));
    } finally {
        out.close();
    }
}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值