-
package readImgUrl;
-
-
import java.io.BufferedInputStream;
-
import java.io.BufferedReader;
-
import java.io.BufferedWriter;
-
import java.io.File;
-
import java.io.FileInputStream;
-
import java.io.FileOutputStream;
-
import java.io.FileReader;
-
import java.io.FileWriter;
-
import java.io.InputStream;
-
import java.io.InputStreamReader;
-
import java.io.OutputStreamWriter;
-
import java.net.URL;
-
import java.util.ArrayList;
-
import java.util.Arrays;
-
import java.util.Collections;
-
import java.util.Comparator;
-
import java.util.List;
-
-
/**
 * Helpers for splitting a large listing of {@code "<id> <url>"} lines into
 * hash buckets keyed by the URL host, for sorting a bucket file by its second
 * field, and for guessing a file's text encoding from its leading bytes.
 *
 * <p>All file-system failures are reported with {@code printStackTrace()} and
 * do not propagate, except in {@link #codeString(String)}, which declares
 * {@code throws Exception}.
 */
public class ClassifyUrl {

    /** Number of buckets; {@link #hash(char[])} returns values in {@code [0, HASHLEN)}. */
    private static final int HASHLEN = 100;

    /** Directory holding the per-bucket files, named {@code <bucket>.txt}. */
    private static final String file_dir = "D:\\学习\\实验室项目\\ImageNet图片爬取\\classify_url\\";

    /** Listing processed by {@link #main(String[])}. */
    private static final String src_file = "D:\\学习\\实验室项目\\ImageNet图片爬取\\fall11_urls.txt";

    /**
     * Runs {@link #classify_url(String)} over the configured source listing.
     *
     * @param args unused
     * @throws Exception declared for historical reasons; the current body reports
     *         failures instead of throwing them
     */
    public static void main(String[] args) throws Exception {
        classify_url(src_file);
    }

    /**
     * Sorts the lines of {@code <file_dir><filename>.txt} by their second
     * space-separated field and writes the result to
     * {@code <file_dir><filename>_.txt}. The line count is printed to standard output.
     *
     * <p>Lines without a second field sort as if that field were empty. If the
     * input cannot be read, nothing is written.
     *
     * @param filename bucket file name without directory or {@code .txt} suffix
     */
    public static void rank_filedata(String filename) {
        String inputPath = file_dir + filename + ".txt";
        String outputPath = file_dir + filename + "_" + ".txt";

        List<String> lines = reader_list(inputPath);
        if (lines == null) {
            // reader_list has already reported the failure.
            return;
        }
        System.out.println(lines.size());

        Collections.sort(lines, new Comparator<String>() {
            public int compare(String left, String right) {
                return sortKey(left).compareTo(sortKey(right));
            }
        });
        writer_list(lines, outputPath);
    }

    /** Returns the second space-separated field of {@code line}, or {@code ""} if there is none. */
    private static String sortKey(String line) {
        String[] fields = line.split(" ");
        return fields.length > 1 ? fields[1] : "";
    }

    /**
     * Reads a text file into a list of lines using the platform default charset.
     *
     * @param path file to read
     * @return the lines in file order, or {@code null} if the file could not be
     *         read (the failure is printed to standard error)
     */
    public static List<String> reader_list(String path) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(path));
            List<String> lineList = new ArrayList<String>();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                lineList.add(line);
            }
            return lineList;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            closeResource(reader);
        }
    }

    /**
     * Writes every element of {@code list} to {@code path}, one per line,
     * terminated with {@code "\r\n"}, replacing any existing file content. The
     * platform default charset is used. Failures are printed to standard error.
     *
     * @param list lines to write
     * @param path destination file
     */
    public static void writer_list(List<String> list, String path) {
        BufferedWriter writer = null;
        try {
            writer = new BufferedWriter(new FileWriter(path));
            for (String line : list) {
                writer.write(line + "\r\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Closing also flushes the buffer, so it must happen on every path.
            closeResource(writer);
        }
    }

    /**
     * Reads {@code path} line by line, decoding it with the charset reported by
     * {@link #judgeFileCode(String)}, and computes the host bucket of every line.
     * A line that cannot be parsed (no second field, malformed URL) is reported
     * on standard error and skipped.
     *
     * <p>After every 100th processed line, the line that follows it is echoed to
     * standard output when {@link #isCnorEn(char)} accepts at least one of its
     * characters.
     *
     * <p>NOTE(review): the bucket is computed but not persisted; the call to
     * {@link #writer(String, String)} is disabled in this revision. Re-enable it
     * at the marked place to actually split the file.
     *
     * @param path listing whose lines look like {@code "<id> <url>"}
     */
    public static void classify_url(String path) {
        String filecode = judgeFileCode(path);
        if (filecode == null) {
            // judgeFileCode has already reported why the file is unreadable.
            return;
        }

        FileInputStream raw = null;
        BufferedReader reader = null;
        try {
            raw = new FileInputStream(path);
            reader = new BufferedReader(new InputStreamReader(raw, filecode));

            int line_num = 0;
            String line = reader.readLine();
            while (line != null) {
                try {
                    // Disabled bucket write would be: writer(String.valueOf(bucketOf(line)), line);
                    bucketOf(line);
                } catch (Exception e) {
                    e.printStackTrace();
                }

                line = reader.readLine();
                line_num++;
                // The null check matters: at end of input the next line does not exist.
                if (line != null && line_num % 100 == 0 && containsCnOrEn(line)) {
                    System.out.println(line);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeResource(reader);
            // Covers the case where the reader wrapper was never constructed.
            closeResource(raw);
        }
    }

    /** Parses the second space-separated field of {@code line} as a URL and hashes its host. */
    private static int bucketOf(String line) throws java.net.MalformedURLException {
        String host = new URL(line.split(" ")[1]).getHost();
        return hash(host.toCharArray());
    }

    /** Returns whether any character of {@code text} is accepted by {@link #isCnorEn(char)}. */
    private static boolean containsCnOrEn(String text) {
        for (char c : text.toCharArray()) {
            if (isCnorEn(c)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns whether {@code c} lies in one of the two ranges this class treats
     * as "English" ({@code U+0000..U+00FF}) or "Chinese" ({@code U+0391..U+FFE5}).
     *
     * <p>The second range is very broad: it starts at {@code U+0391} (Greek
     * capital alpha), so it covers far more than Chinese characters.
     *
     * @param c character to test
     * @return {@code true} if {@code c} falls in either range
     */
    static boolean isCnorEn(char c) {
        // A char is never negative, so only the upper bound of the first range matters.
        return c <= 0x00FF || (c >= 0x0391 && c <= 0xFFE5);
    }

    /**
     * Maps a character sequence to a bucket number in {@code [0, HASHLEN)}.
     *
     * <p>NOTE(review): the accumulator is multiplied by 32 at each step (the
     * historical form was {@code index += index * 31 + c}), so with 32-bit
     * wrap-around only the last seven characters influence the result. A
     * conventional {@code index * 31 + c} would spread hosts more evenly, but it
     * would also change every bucket assignment, so the values are kept as they are.
     *
     * @param word characters to hash; an empty array hashes to {@code 0}
     * @return bucket number between {@code 0} and {@code HASHLEN - 1}
     */
    public static int hash(char[] word) {
        int index = 0;
        for (char c : word) {
            index = index * 32 + c;
        }
        // The remainder is within (-HASHLEN, HASHLEN), so abs cannot overflow.
        return Math.abs(index % HASHLEN);
    }

    /**
     * Appends {@code line} followed by {@code "\r\n"} to
     * {@code <file_dir><filename>.txt}, encoded as GBK. The file is created if it
     * does not exist, even when {@code line} is {@code null} (nothing is appended
     * in that case). Failures are printed to standard error.
     *
     * @param filename bucket file name without directory or {@code .txt} suffix
     * @param line     text to append; {@code null} appends nothing
     */
    public static void writer(String filename, String line) {
        String path = file_dir + filename + ".txt";

        FileOutputStream raw = null;
        OutputStreamWriter out = null;
        try {
            // Append mode creates the file when it is missing.
            raw = new FileOutputStream(path, true);
            out = new OutputStreamWriter(raw, "GBK");
            if (line != null) {
                out.write(line + "\r\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeResource(out);
            // Covers the case where the charset lookup failed before wrapping.
            closeResource(raw);
        }
    }

    /**
     * Guesses the encoding of a file from its first three bytes.
     *
     * @param path file to inspect
     * @return {@code "UTF-8"} if the file starts with the bytes {@code EF BB BF},
     *         {@code "GBK"} otherwise, or {@code null} if the file could not be
     *         read (the failure is printed to standard error)
     */
    public static String judgeFileCode(String path) {
        InputStream in = null;
        try {
            in = new FileInputStream(new File(path));

            byte[] head = new byte[3];
            int filled = 0;
            // A single read may return fewer bytes than requested.
            while (filled < head.length) {
                int n = in.read(head, filled, head.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }

            boolean hasUtf8Bom = filled == head.length
                    && head[0] == (byte) 0xEF
                    && head[1] == (byte) 0xBB
                    && head[2] == (byte) 0xBF;
            return hasUtf8Bom ? "UTF-8" : "GBK";
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            closeResource(in);
        }
    }

    /**
     * Guesses the encoding of a file from its first two bytes.
     *
     * <p>The bytes are combined big-endian and matched against {@code 0xEFBB},
     * {@code 0xFFFE}, {@code 0xFEFF} and {@code 0x5C75}; anything else, including
     * a file shorter than two bytes, yields {@code "GBK"}.
     *
     * <p>NOTE(review): {@code "ANSI|ASCII"} is a label, not a name that
     * {@code Charset.forName} accepts; confirm callers do not pass it to a decoder.
     *
     * @param fileName file to inspect
     * @return one of {@code "UTF-8"}, {@code "Unicode"}, {@code "UTF-16BE"},
     *         {@code "ANSI|ASCII"} or {@code "GBK"}
     * @throws Exception if the file cannot be opened, read or closed
     */
    public static String codeString(String fileName) throws Exception {
        BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName));
        try {
            int first = bin.read();
            int second = bin.read();
            int p = (first << 8) + second;

            switch (p) {
                case 0xefbb:
                    return "UTF-8";
                case 0xfffe:
                    return "Unicode";
                case 0xfeff:
                    return "UTF-16BE";
                case 0x5c75:
                    return "ANSI|ASCII";
                default:
                    return "GBK";
            }
        } finally {
            bin.close();
        }
    }

    /** Closes {@code resource} if it is non-null, reporting (not throwing) any close failure. */
    private static void closeResource(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (java.io.IOException e) {
            e.printStackTrace();
        }
    }
}
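// Source article title: "Splitting a file with a hash" (使用hash拆分文件)
// Page footer: latest recommended article published 2023-11-28 14:29:14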