//【原创】本程序把网页(php)源代码读取后,以txt文件格式存入指定路径下;然后批量读取这些txt文件,判断是否存在中文字符,若存在,则打印出来。
// 注:php的url需要手动保存到AA.txt里面,也可以用自动抓取url的方式,那样就方便多了。
// 对于打印出来的中文字符,也可以用日志方式保存到文件里,方便查看。
package html;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
/**
 * Downloads the source of every web page listed in a URL file into a log
 * directory, then scans the saved files and prints each Chinese character
 * found on lines that do not look like source comments.
 *
 * <p>Saved files are written and read back as UTF-8 so the scan does not
 * depend on the platform default charset.
 */
public class source_html {
    /** Not used by this class; retained so existing references keep compiling. */
    static StringBuffer document = new StringBuffer();

    /**
     * Reads one URL per line from the URL list file, saves each page's source
     * into the log directory, then scans that directory for Chinese characters.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = "c://mytino//AA.txt"; // file listing the URLs of the English pages
        String url_log = "c://mytino//log//"; // directory that receives the page sources

        try (BufferedReader in = new BufferedReader(new FileReader(url))) {
            String fileURL;
            while ((fileURL = in.readLine()) != null) {
                fileURL = fileURL.trim();
                if (fileURL.isEmpty()) {
                    continue; // a blank line is not a URL; skip instead of failing on it
                }
                System.out.println(fileURL);
                write_log(fileURL, url_log);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        try {
            getDir(url_log);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the page at {@code fileURL} (decoded as UTF-8) and writes it, line
     * by line with CRLF terminators, to a UTF-8 text file inside {@code url_log}.
     * Failures are reported on stderr and do not propagate, so one bad URL does
     * not stop a batch.
     *
     * @param fileURL address of the page to download
     * @param url_log target directory path, including its trailing separator
     */
    public static void write_log(String fileURL, String url_log) {
        try {
            URLConnection conn = new URL(fileURL).openConnection();
            File target = new File(url_log + logFileName(fileURL));
            // try-with-resources closes both streams even when the copy fails midway.
            try (BufferedReader reader = new BufferedReader(
                         new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8));
                 BufferedWriter out = new BufferedWriter(
                         new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    out.write(line);
                    out.write("\r\n");
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Derives the log file name for a URL: the scheme prefix (through "://") is
     * dropped, every '/' becomes '!', characters that Windows forbids in file
     * names become '_', and ".txt" is appended.
     */
    private static String logFileName(String fileURL) {
        int schemeEnd = fileURL.indexOf("://");
        String rest = schemeEnd >= 0 ? fileURL.substring(schemeEnd + 3) : fileURL;
        StringBuilder name = new StringBuilder(rest.length() + 4);
        for (int i = 0; i < rest.length(); i++) {
            char c = rest.charAt(i);
            if (c == '/') {
                name.append('!');
            } else if ("\\:*?\"<>|".indexOf(c) >= 0) {
                name.append('_');
            } else {
                name.append(c);
            }
        }
        return name.append(".txt").toString();
    }

    /**
     * Walks {@code url_log} recursively, printing each directory and file path
     * and scanning every regular file for Chinese characters. A file that cannot
     * be read is reported and the walk continues with the next entry.
     *
     * @param url_log directory to scan
     * @throws Exception declared for compatibility with existing callers
     */
    static void getDir(String url_log) throws Exception {
        File[] fList = new File(url_log).listFiles();
        if (fList == null) {
            // listFiles() yields null when the path is not a directory or cannot be read.
            System.out.println("Error: cannot list directory " + url_log);
            return;
        }
        for (File entry : fList) {
            if (entry.isDirectory()) {
                System.out.println("Directory is: " + entry.getPath());
                getDir(entry.getPath());
            } else {
                String name = entry.getPath();
                System.out.println("filename is: " + name);
                try {
                    isChinese(name);
                } catch (IOException e) {
                    System.out.println("Error: " + e);
                }
                System.out.println();
            }
        }
    }

    /**
     * Prints every Chinese character (U+4E00 through U+9FA5) in the UTF-8 file
     * {@code name}, each followed by "+". Lines that contain a line-comment
     * marker, start with a block-comment opener, or end with a block-comment
     * terminator are skipped entirely.
     *
     * @param name path of the file to scan
     * @throws IOException if the file cannot be opened or read
     */
    private static void isChinese(String name) throws IOException {
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(name), StandardCharsets.UTF_8))) {
            String str;
            while ((str = in.readLine()) != null) {
                // The comment test depends only on the line, so decide once per line.
                if (str.contains("//") || str.startsWith("/*") || str.endsWith("*/")) {
                    continue;
                }
                for (int i = 0; i < str.length(); i++) {
                    char c = str.charAt(i);
                    // Code-point range test; a byte-length test would vary with the charset.
                    if (c >= '\u4e00' && c <= '\u9fa5') {
                        System.out.print(c + "+");
                    }
                }
            }
        }
    }
}