Java 超快速文本去重复代码
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Command-line tool that removes duplicate lines from a UTF-8 text file.
 *
 * <p>Usage: {@code java -Xmx1000m SpeedClear <source> <destination>}. Every distinct line of
 * the source file is written exactly once to the destination file, in the order of its first
 * occurrence, terminated by {@code "\r\n"}. All distinct lines are held in memory at once, so
 * the heap must be large enough for the input.
 */
public class SpeedClear {

    /** Capacity, in chars, of the reader's buffer (20 Mi chars). */
    private static final int READ_BUFFER_CHARS = 20 * 1024 * 1024;

    /** A progress marker is printed once for every this many input lines. */
    private static final int PROGRESS_INTERVAL = 30000;

    /**
     * Entry point. Prints the usage banner and exits with status 1 when called without
     * arguments; exits with status 1 when the argument count is not exactly two; otherwise
     * de-duplicates {@code args[0]} into {@code args[1]}.
     *
     * @param args source path followed by destination path
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            print();
            System.exit(1);
        }
        if (args.length != 2) {
            System.out.println("Format error...");
            System.exit(1);
        }
        String pathname = args[0];
        String newPath = args[1];
        clear(pathname, newPath); // run the de-duplication
    }

    /**
     * Copies the distinct lines of {@code pathname} to {@code newPath}.
     *
     * <p>The source is read completely before the destination is opened, so the destination
     * is left untouched when reading fails and both paths may refer to the same file.
     * Failures are reported on the console rather than thrown, so this method never
     * propagates an exception to the caller.
     *
     * @param pathname path of the source file (UTF-8 text)
     * @param newPath  path of the file to create or overwrite with the de-duplicated lines
     */
    public static void clear(String pathname, String newPath) {
        System.out.println("Start... ");
        try {
            Set<String> lines = readUniqueLines(pathname);
            writeLines(lines, newPath);
            System.out.println("");
            System.out.println("size = " + lines.size());
            System.out.println("End...");
        } catch (IOException | RuntimeException e) {
            // Report the real cause (missing file, permission problem, null path, ...)
            // instead of blaming the file size for every failure.
            System.out.println("");
            System.err.println("Failed to de-duplicate " + pathname + " -> " + newPath + ": " + e);
        } catch (OutOfMemoryError e) {
            // Running out of heap is the one failure the size hint actually applies to.
            // By the time we get here the partially built set is no longer reachable.
            System.out.println("");
            System.out.println("文件太大了,建议先100MB大小..");
        }
    }

    /** Reads {@code pathname} as UTF-8 and returns its distinct lines in first-seen order. */
    private static Set<String> readUniqueLines(String pathname) throws IOException {
        Set<String> unique = new LinkedHashSet<String>();
        try (InputStream in = new FileInputStream(pathname);
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(in, StandardCharsets.UTF_8), READ_BUFFER_CHARS)) {
            long count = 0; // long: an int counter would overflow past ~2.1 billion lines
            String line;
            while ((line = reader.readLine()) != null) {
                unique.add(line);
                if (count % PROGRESS_INTERVAL == 0) {
                    System.out.print("..");
                }
                count++;
            }
        }
        return unique;
    }

    /** Writes each element of {@code lines} to {@code newPath} as UTF-8, ending each with CRLF. */
    private static void writeLines(Set<String> lines, String newPath) throws IOException {
        try (OutputStream os = new FileOutputStream(newPath);
             Writer out = new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8))) {
            for (String line : lines) {
                out.write(line);
                out.write("\r\n");
            }
        }
    }

    /** Prints the usage banner to standard output. */
    public static void print() {
        System.out.println("*************************************************");
        System.out.println("\t\tTo repeat \t\t");
        System.out.println();
        System.out.println(" format: java -Xmx1000m SpeedClear c:\\old.txt c:\\new.txt\t\t");
        System.out.println();
        System.out.println("\t\tAuthor:xxser QQ:616100108");
        System.out.println("*************************************************");
    }
}