使用hash拆分文件

  1. package readImgUrl;
  2. import java.io.BufferedInputStream;
  3. import java.io.BufferedReader;
  4. import java.io.BufferedWriter;
  5. import java.io.File;
  6. import java.io.FileInputStream;
  7. import java.io.FileOutputStream;
  8. import java.io.FileReader;
  9. import java.io.FileWriter;
  10. import java.io.InputStream;
  11. import java.io.InputStreamReader;
  12. import java.io.OutputStreamWriter;
  13. import java.net.URL;
  14. import java.util.ArrayList;
  15. import java.util.Arrays;
  16. import java.util.Collections;
  17. import java.util.Comparator;
  18. import java.util.List;
  /**
   * Utilities for splitting a large "id url" listing into buckets by hashing each URL's host
   * name, for sorting one bucket file by its second column, and for guessing a file's text
   * encoding from its leading bytes.
   */
  public class ClassifyUrl {
      /** Number of buckets; {@link #hash(char[])} returns a value in {@code [0, HASHLEN)}. */
      private static final int HASHLEN = 100;
      /** Directory holding the per-bucket files ({@code <bucket>.txt}). */
      private static String file_dir = "D:\\学习\\实验室项目\\ImageNet图片爬取\\classify_url\\";
      /** The listing that {@link #main(String[])} processes. */
      private static String src_file = "D:\\学习\\实验室项目\\ImageNet图片爬取\\fall11_urls.txt";

      /**
       * Entry point: runs {@link #classify_url(String)} on the configured source listing.
       *
       * @param args unused
       * @throws Exception declared for compatibility; the called code reports errors itself
       */
      public static void main(String[] args) throws Exception {
          classify_url(src_file);
      }

      /**
       * Sorts the lines of {@code file_dir + filename + ".txt"} by their second space-separated
       * field and writes the result to {@code file_dir + filename + "_.txt"}.
       *
       * <p>Lines without a second field sort as if that field were empty. If the input cannot be
       * read, nothing is written.
       *
       * @param filename bucket file name without directory or extension
       */
      public static void rank_filedata(String filename){
          String path1 = file_dir+filename+".txt";
          String path2 = file_dir+filename+"_"+".txt";
          List<String> list = reader_list(path1);
          if (list == null) {
              // reader_list has already printed the cause; avoid a NullPointerException here.
              return;
          }
          System.out.println(list.size());
          Collections.sort(list, new Comparator<String>() {
              public int compare(String s1, String s2) {
                  return secondField(s1).compareTo(secondField(s2));
              }
          });
          writer_list(list, path2);
      }

      /** Returns the second space-separated field of {@code line}, or "" when there is none. */
      private static String secondField(String line) {
          String[] parts = line.split(" ");
          return parts.length > 1 ? parts[1] : "";
      }

      /**
       * Reads every line of a text file using the platform default charset.
       *
       * @param path file to read
       * @return the lines in file order, or {@code null} if the file could not be read (the
       *     exception is printed to standard error)
       */
      public static List<String> reader_list(String path){
          List<String> lineList = new ArrayList<String>();
          try {
              BufferedReader reader = new BufferedReader(new FileReader(path));
              try {
                  String line = reader.readLine();
                  while (null != line) {
                      lineList.add(line);
                      line = reader.readLine();
                  }
              } finally {
                  // Close even when reading fails part-way through.
                  reader.close();
              }
              return lineList;
          } catch (Exception e) {
              e.printStackTrace();
          }
          return null;
      }

      /**
       * Writes each element of {@code list} to {@code path}, one per line, terminated by CRLF.
       * An existing file is overwritten. Failures are printed to standard error.
       *
       * @param list lines to write; elements are written via their string form
       * @param path destination file
       */
      public static void writer_list(List<?> list, String path){
          try {
              BufferedWriter writer = new BufferedWriter(new FileWriter(path));
              try {
                  for (Object item : list) {
                      writer.write(item + "\r\n");
                  }
              } finally {
                  writer.close();
              }
          } catch (Exception e) {
              e.printStackTrace();
          }
      }

      /**
       * Reads the listing at {@code path} line by line, computes the bucket of each line from
       * the host of the URL in its second space-separated field, and echoes every line that
       * follows a multiple of 100 processed lines when it contains a character accepted by
       * {@link #isCnorEn(char)}.
       *
       * <p>NOTE(review): the call that would append each line to its bucket file is commented
       * out below, exactly as in the original code, so this method currently writes no files.
       *
       * @param path listing to process; its encoding is guessed with
       *     {@link #judgeFileCode(String)}
       */
      public static void classify_url(String path){
          String filecode = judgeFileCode(path);
          if (filecode == null) {
              // judgeFileCode has already reported why the file could not be inspected.
              return;
          }
          try {
              FileInputStream in = new FileInputStream(path);
              try {
                  BufferedReader reader = new BufferedReader(new InputStreamReader(in, filecode));
                  int line_num = 0;
                  String line = reader.readLine();
                  while (null != line) {
                      try {
                          String host = new URL(line.split(" ")[1]).getHost();
                          int bucket = hash(host.toCharArray());
                          // writer(bucket+"", line);
                      } catch (Exception e) {
                          // A malformed line must not stop the rest of the listing.
                          e.printStackTrace();
                      }
                      line = reader.readLine();
                      line_num++;
                      // line is null once the end of the file is reached; without this check a
                      // listing whose length is a multiple of 100 failed with a
                      // NullPointerException.
                      if (line_num % 100 == 0 && line != null) {
                          printIfCnOrEn(line);
                      }
                  }
              } finally {
                  in.close();
              }
          } catch (Exception e) {
              e.printStackTrace();
          }
      }

      /** Prints {@code line} once if any of its characters is accepted by isCnorEn. */
      private static void printIfCnOrEn(String line) {
          for (char c : line.toCharArray()) {
              if (isCnorEn(c)) {
                  System.out.println(line);
                  return;
              }
          }
      }

      /**
       * Tells whether {@code c} lies in U+0000..U+00FF or in U+0391..U+FFE5, the two ranges the
       * original author labelled "English" and "Chinese" characters respectively.
       */
      static boolean isCnorEn(char c) {
          boolean inWideRange = c >= 0x0391 && c <= 0xFFE5;
          boolean inLatinRange = c <= 0x00FF;
          return inWideRange || inLatinRange;
      }

      /**
       * Maps a character sequence to a bucket number.
       *
       * @param word characters to hash
       * @return a value in {@code [0, HASHLEN)}
       */
      public static int hash(char[] word) {
          int index = 0;
          for (int i = 0; i < word.length; i++) {
              // "+=" makes this index = 32 * index + c rather than 31 * index + c. It is kept
              // unchanged so that bucket numbers stay the same as those computed before.
              index += index * 31 + word[i];
          }
          return Math.abs(index % HASHLEN);
      }

      /**
       * Appends {@code line} plus CRLF, encoded as GBK, to {@code file_dir + filename + ".txt"}.
       * The file is created if it does not exist; a {@code null} line only creates the file.
       * Failures are printed to standard error.
       *
       * @param filename bucket file name without directory or extension
       * @param line text to append, may be {@code null}
       */
      public static void writer(String filename, String line){
          String path = file_dir+filename+".txt";
          try {
              // Opening in append mode creates the file when it is missing.
              FileOutputStream out = new FileOutputStream(path, true);
              try {
                  OutputStreamWriter writer = new OutputStreamWriter(out, "GBK");
                  if (null != line) {
                      writer.write(line+"\r\n");
                  }
                  writer.flush();
              } finally {
                  out.close();
              }
          } catch (Exception e) {
              e.printStackTrace();
          }
      }

      /**
       * Guesses a file's encoding from its first three bytes.
       *
       * @param path file to inspect
       * @return {@code "UTF-8"} if the file starts with the bytes EF BB BF, {@code "GBK"}
       *     otherwise, or {@code null} if the file could not be read (the exception is printed
       *     to standard error)
       */
      public static String judgeFileCode(String path){
          try {
              InputStream in = new FileInputStream(new File(path));
              try {
                  byte[] b = new byte[3];
                  int filled = 0;
                  // A single read may return fewer bytes than requested; keep reading until
                  // the buffer is full or the file ends.
                  while (filled < b.length) {
                      int n = in.read(b, filled, b.length - filled);
                      if (n < 0) {
                          break;
                      }
                      filled += n;
                  }
                  if (b[0] == -17 && b[1] == -69 && b[2] == -65) {
                      return "UTF-8";
                  }
                  return "GBK";
              } finally {
                  in.close();
              }
          } catch (Exception e) {
              e.printStackTrace();
          }
          return null;
      }

      /**
       * Guesses a file's encoding from its first two bytes.
       *
       * @param fileName file to inspect
       * @return {@code "UTF-8"} for EF BB, {@code "Unicode"} for FF FE, {@code "UTF-16BE"} for
       *     FE FF, {@code "ANSI|ASCII"} for 5C 75 (the ASCII text {@code \}{@code u}), and
       *     {@code "GBK"} for anything else, including files shorter than two bytes
       * @throws Exception if the file cannot be opened or read
       */
      public static String codeString(String fileName) throws Exception{
          int first;
          int second;
          BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName));
          try {
              first = bin.read();
              second = bin.read();
          } finally {
              bin.close();
          }
          if (first < 0 || second < 0) {
              // Fewer than two bytes: no signature can be present. Folding the -1 end-of-file
              // marker into the arithmetic made a lone 0xFF byte look like FE FF.
              return "GBK";
          }
          String code;
          switch ((first << 8) + second) {
              case 0xefbb:
                  code = "UTF-8";
                  break;
              case 0xfffe:
                  code = "Unicode";
                  break;
              case 0xfeff:
                  code = "UTF-16BE";
                  break;
              case 0x5c75:
                  code = "ANSI|ASCII";
                  break;
              default:
                  code = "GBK";
          }
          return code;
      }
  }
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值