// Splits multiple files into m parts and supports reading each part independently (random access).
package utils
import (
"bufio"
"io"
"log"
"os"
"strings"
)
// SplitableFile describes a list of files divided into m partitions of
// consecutive lines. Partition boundaries are accurate to one line.
//
// Author: delicious
type SplitableFile struct {
// m is the number of partitions.
m int
// files holds the paths of the underlying files, in order.
files []string
// The three slices below are indexed by partition number.
offsetF []int // index into files of the file the partition begins in
offsetL []int // number of lines preceding the partition inside that file
lines []int // number of lines the partition contains
}
// FromFiles builds a SplitableFile over files that is divided into m
// partitions. The per-partition tables are allocated with length m and then
// filled in by init before the value is returned.
func FromFiles(files []string, m int) SplitableFile {
	var sf SplitableFile
	sf.files = files
	sf.m = m
	sf.offsetF = make([]int, m)
	sf.offsetL = make([]int, m)
	sf.lines = make([]int, m)
	sf.init()
	return sf
}
// init counts the lines of every file in sf.files and fills in, for each of
// the m partitions, how many lines it holds (sf.lines) and where it starts
// (sf.offsetF / sf.offsetL).
//
// The first m-1 partitions receive sum/m lines each and the last partition
// receives whatever remains. When m is not positive there is nothing to
// partition and init returns without touching the tables. Failure to open or
// read a file terminates the program through log.Fatalln.
func (sf *SplitableFile) init() {
	m := sf.m
	if m <= 0 {
		// No partitions requested; this also avoids dividing by zero below.
		return
	}

	// sl[i] is the number of lines in sf.files[i].
	sl := make([]int, len(sf.files))
	sum := 0
	for i, path := range sf.files {
		sl[i] = countLinesInFile(path)
		sum += sl[i]
	}

	for i := range sf.lines {
		sf.lines[i] = sum / m
	}
	sf.lines[m-1] = sum - (m-1)*(sum/m)

	// (fid, lid) is the position of the next line not yet assigned to a
	// partition: file index and number of lines already consumed in it.
	fid, lid := 0, 0
	for i, need := range sf.lines {
		// Step over files that are already used up so the recorded start
		// points at a real line. The last file is never stepped over, which
		// keeps offsetF a valid index even when no lines are left.
		for fid < len(sl)-1 && lid >= sl[fid] {
			fid++
			lid = 0
		}
		sf.offsetF[i] = fid
		sf.offsetL[i] = lid

		// Consume `need` lines, moving across file boundaries as required.
		for need > 0 && fid < len(sl) {
			avail := sl[fid] - lid
			if need <= avail {
				lid += need
				break
			}
			need -= avail
			fid++
			lid = 0
		}
	}
}

// countLinesInFile returns the number of lines in the file at path. A final
// line without a trailing newline counts as a line. Open and read errors
// terminate the program through log.Fatalln.
func countLinesInFile(path string) int {
	file, err := os.Open(path)
	if err != nil {
		log.Fatalln(err)
	}

	r := bufio.NewReader(file)
	count := 0
	// partial is true while the fragments read so far belong to a line whose
	// end has not been seen yet (ReadLine reported isPrefix).
	partial := false
	for {
		_, isPrefix, err := r.ReadLine()
		if err == io.EOF {
			if partial {
				// The file ended in the middle of a long unterminated line.
				count++
			}
			break
		}
		if err != nil {
			// Any other error would repeat forever; stop instead of spinning.
			_ = file.Close()
			log.Fatalln(err)
		}
		partial = isPrefix
		if !isPrefix {
			count++
		}
	}
	_ = file.Close() // read-only handle; a close error carries no information here
	return count
}
// ReadSplit returns the content of partition ith (zero-based) as a single
// string whose lines are joined with "\n"; there is no trailing newline.
//
// Reading starts in file sf.offsetF[ith] after skipping sf.offsetL[ith] lines
// and continues into the following files until sf.lines[ith] lines have been
// collected or the files run out. An empty partition yields "" without
// opening any file. Failure to open or read a file terminates the program
// through log.Fatalln.
func (sf *SplitableFile) ReadSplit(ith int) string {
	want := sf.lines[ith]
	if want <= 0 {
		return ""
	}

	fid := sf.offsetF[ith]
	r, f := sf.reader(fid)

	// nextLine returns the next complete line of the current file. ReadLine
	// hands back long lines in fragments (isPrefix), so the fragments are
	// joined here. The second result is false once the file is exhausted.
	nextLine := func() (string, bool) {
		var joined []byte
		for {
			chunk, isPrefix, err := r.ReadLine()
			if err == io.EOF {
				// Fragments may be pending when an unterminated last line
				// ends exactly on a buffer boundary; that still is a line.
				return string(joined), len(joined) > 0
			}
			if err != nil {
				log.Fatalln(err)
			}
			joined = append(joined, chunk...)
			if !isPrefix {
				return string(joined), true
			}
		}
	}

	// Skip the lines that belong to earlier partitions.
	for i := 0; i < sf.offsetL[ith]; i++ {
		if _, ok := nextLine(); !ok {
			break
		}
	}

	var builder strings.Builder
	for got := 0; got < want; {
		line, ok := nextLine()
		if !ok {
			// Current file is used up: continue with the next one. This
			// iteration produced no line, so it must not be counted.
			_ = f.Close()
			fid++
			if fid >= len(sf.files) {
				return builder.String()
			}
			r, f = sf.reader(fid)
			continue
		}
		// Separate by line count rather than builder length, otherwise the
		// newline after a leading empty line would be lost.
		if got > 0 {
			builder.WriteString("\n")
		}
		builder.WriteString(line)
		got++
	}
	_ = f.Close() // read-only handle; a close error carries no information here
	return builder.String()
}
// reader opens the file at index fid of sf.files and returns a buffered
// reader over it together with the open file, which the caller must close.
// If the file cannot be opened the program terminates through log.Fatalln.
func (sf *SplitableFile) reader(fid int) (*bufio.Reader, *os.File) {
	handle, openErr := os.Open(sf.files[fid])
	if openErr != nil {
		log.Fatalln(openErr)
	}
	buffered := bufio.NewReader(handle)
	return buffered, handle
}