转载

利用归并排序算法对大文件进行排序

归并排序算法介绍,请参照Wikipeida

zh.wikipedia.org/wiki/%E5%BD%92%E5%B9%B6%E6%8E%92%E5%BA%8F

基本思想:

大文件分割成行数相等的两个小文件,使用递归一直分割到所有所有小文件低于限制行数

小文件直接排序

两个排序好的小文件归并到大文件

直到最后所有排序好的文件归并到输入的大文件并返回

之前看了网上很多示例代码,写的很不简洁, 引入了过多的临时变量i, j, k等等, 导致程序基本没法看,

只好自己写了一个,没有很关心执行效率, 只求够用,以后有机会再优化一下吧。

JDK要求Java 8

package com.java.sort.merge; import com.google.common.base.Charsets; import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterators; import com.google.common.collect.PeekingIterator; import com.google.common.io.Files; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.io.filefilter.AndFileFilter; import org.apache.commons.io.filefilter.PrefixFileFilter; import org.apache.commons.io.filefilter.SuffixFileFilter; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; import java.util.Collections; import java.util.List; public class FileMergeSort {  private static int limit = 9999;  private static void cleanTempFiles() {   FilenameFilter filter = new AndFileFilter(ImmutableList.of(new PrefixFileFilter("sort"), new SuffixFileFilter(".part")));   ImmutableList.copyOf(FileUtils.getTempDirectory().listFiles(filter)).forEach(File::delete);  }  private static int lineNumber(File input) throws IOException {   int count = 0;   LineIterator iterator = FileUtils.lineIterator(input);   while (iterator.hasNext()) {    iterator.next();    count++;   }   return count;  }  private static File split(File input, int from, int to) throws IOException {   File part = File.createTempFile("sort", ".part");   Long lineNumber = 0L;   String line = null;   List<String> lines = new ArrayList<>(to - from);   LineIterator iterator = FileUtils.lineIterator(input);   while (iterator.hasNext()) {    if (lineNumber > to) break;    line = iterator.next();    if (lineNumber >= from && lineNumber <= to) {     lines.add(line);    }    lineNumber++;   }   FileUtils.writeLines(part, lines);   return part;  }  private static File merge(File source, File left, File right) throws IOException {   PeekingIterator<String> leftLineIterator = Iterators.peekingIterator(FileUtils.lineIterator(left));   PeekingIterator<String> rightLineIterator = Iterators.peekingIterator(FileUtils.lineIterator(right));   String leftLine, rightLine;   try (Writer writer = Files.newWriter(source, Charsets.UTF_8)) {    writer.write("");    while (leftLineIterator.hasNext() && rightLineIterator.hasNext()) {     leftLine = leftLineIterator.peek();     rightLine = rightLineIterator.peek();     if (leftLine.compareTo(rightLine) < 0) {      writer.append(leftLine.concat(IOUtils.LINE_SEPARATOR));      leftLineIterator.next();     } else {      writer.append(rightLine.concat(IOUtils.LINE_SEPARATOR));      rightLineIterator.next();     }    }    while (leftLineIterator.hasNext()) {     writer.append(leftLineIterator.next().concat(IOUtils.LINE_SEPARATOR));    }    while (rightLineIterator.hasNext()) {     writer.append(rightLineIterator.next().concat(IOUtils.LINE_SEPARATOR));    }   }   return source;  }  private static File directSort(File input) throws IOException {   List<String> list = new ArrayList<>(limit);   FileUtils.lineIterator(input).forEachRemaining(list::add);   Collections.sort(list);   FileUtils.writeLines(input, list);   return input;  }  public static File mergeSort(File input) throws IOException {   int total = lineNumber(input);   if (total <= limit) {    return directSort(input);   }   int half = total / 2;   File left = mergeSort(split(input, 0, half));   File right = mergeSort(split(input, half + 1, total));   return merge(input, left, right);  }  @BeforeClass  public static void init() throws IOException {   cleanTempFiles();   try (Writer writer = Files.newWriter(new File("long.txt"), Charsets.UTF_8)) {    writer.write("");    for (long i = 99999L; i > 0L; i--) {     writer.append(Strings.padStart(String.valueOf(i), 5, '0').concat(IOUtils.LINE_SEPARATOR));    }   }  }  @AfterClass  public static void clean() {   cleanTempFiles();  }  @Test  public void testSort() throws IOException {   File sorted = mergeSort(new File("long.txt"));  } } 
<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0"    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">  <modelVersion>4.0.0</modelVersion>  <groupId>com.java.app</groupId>  <artifactId>sample</artifactId>  <version>1.0-SNAPSHOT</version>  <dependencies>   <dependency>    <groupId>commons-io</groupId>    <artifactId>commons-io</artifactId>    <version>2.4</version>   </dependency>   <dependency>    <groupId>org.apache.commons</groupId>    <artifactId>commons-csv</artifactId>    <version>1.1</version>   </dependency>   <dependency>    <groupId>com.google.guava</groupId>    <artifactId>guava</artifactId>    <version>18.0</version>   </dependency>   <dependency>    <groupId>javax.servlet</groupId>    <artifactId>javax.servlet-api</artifactId>    <version>3.1.0</version>   </dependency>   <dependency>    <groupId>org.apache.logging.log4j</groupId>    <artifactId>log4j-api</artifactId>    <version>2.1</version>   </dependency>   <dependency>    <groupId>org.apache.logging.log4j</groupId>    <artifactId>log4j-core</artifactId>    <version>2.1</version>   </dependency>   <dependency>    <groupId>org.apache.logging.log4j</groupId>    <artifactId>log4j-jcl</artifactId>    <version>2.1</version>   </dependency>   <dependency>    <groupId>junit</groupId>    <artifactId>junit</artifactId>    <version>4.12</version>    <scope>test</scope>   </dependency>  </dependencies> </project> 
正文到此结束
Loading...