java最长公共子串_java-大文本中最长的公共子字符串

我有一份学校作业,要求我们编写代码找出最长公共子字符串。我已经实现了,但它只适用于不太大的文本,而作业要求找出 Moby Dick(《白鲸记》)和 War And Peace(《战争与和平》)的公共子字符串。如果您能指出我哪里做错了、给我一个正确的方向,我将不胜感激。在我创建 SuffixArray 时程序报错,错误指向 MyString 类的 substring 方法,但我不知道为什么它会提示数据太大,并且出现了内存不足的问题。

package datastructuresone;

import java.io.File;

import java.io.FileNotFoundException;

import java.util.Arrays;

import java.util.Scanner;

/**
 * Sorted array of every suffix of a text, with helpers for locating a suffix in
 * the original text and for measuring the longest common prefix of two suffixes.
 */
class SuffixArray {

    /** All suffixes of the input text, ordered by {@code Arrays.sort} (natural ordering). */
    private final MyString[] suffixes;

    /** Number of characters in the input text, which is also the number of suffixes. */
    private final int textLength;

    /**
     * Collects one suffix per start position of {@code s} and sorts them.
     *
     * @param s the text to index
     */
    public SuffixArray(String s) {
        final int count = s.length();
        final MyString whole = new MyString(s);
        final MyString[] all = new MyString[count];
        int start = 0;
        while (start < count) {
            all[start] = whole.substring(start);
            start++;
        }
        Arrays.sort(all);
        this.textLength = count;
        this.suffixes = all;
    }

    /**
     * @return the length of the indexed text
     */
    public int length() {
        return textLength;
    }

    /**
     * Computes the text length minus the length of the {@code i}-th sorted suffix,
     * i.e. the position in the text at which that suffix begins.
     *
     * @param i rank of the suffix in sorted order
     * @return start position of that suffix in the original text
     */
    public int index(int i) {
        final int suffixLength = suffixes[i].length();
        return textLength - suffixLength;
    }

    /**
     * @param i rank of the suffix in sorted order
     * @return the {@code i}-th suffix in sorted order
     */
    public MyString select(int i) {
        return suffixes[i];
    }

    /**
     * Counts how many leading characters {@code s} and {@code t} have in common.
     *
     * @param s first string
     * @param t second string
     * @return length of the longest common prefix of {@code s} and {@code t}
     */
    private static int lcp(MyString s, MyString t) {
        final int limit = Math.min(s.length(), t.length());
        int matched = 0;
        while (matched < limit && s.charAt(matched) == t.charAt(matched)) {
            matched++;
        }
        return matched;
    }

    /**
     * Longest common prefix of the {@code i}-th sorted suffix and the one just before it.
     *
     * @param i rank of the later suffix; the code reads index {@code i - 1} as well
     * @return length of the common prefix of suffixes {@code i} and {@code i - 1}
     */
    public int lcp(int i) {
        final MyString current = suffixes[i];
        final MyString previous = suffixes[i - 1];
        return lcp(current, previous);
    }

    /**
     * Longest common prefix of the {@code i}-th and {@code j}-th sorted suffixes.
     *
     * @param i rank of the first suffix
     * @param j rank of the second suffix
     * @return length of the common prefix of suffixes {@code i} and {@code j}
     */
    public int lcp(int i, int j) {
        final MyString first = suffixes[i];
        final MyString second = suffixes[j];
        return lcp(first, second);
    }

}

public class DataStructuresOne

{

public static void main(String[] args) throws FileNotFoundException

{

Scanner in1 = new Scanner(new File("./build/classes/WarAndPeace.txt"));

Scanner in2 = new Scanner(new File("./build/classes/MobyDick.txt"));

StringBuilder sb = new StringBuilder();

StringBuilder sb1 = new StringBuilder();

while (in1.hasNextLine())

{

sb.append(in1.nextLine());

}

while (in2.hasNextLine())

{

sb1.append(in2.nextLine());

}

String text1 = sb.toString().replaceAll("\s+", " ");

String text2 = sb1.toString().replaceAll("\s+", " ");

int N1 = text1.length();

int N2 = text2.length();

SuffixArray sa = new SuffixArray(text1 + "#" + text2);

int N = sa.length();

String substring = "";

for (int i = 1; i < N; i++)

{

// adjacent suffixes both from second text string

if (sa.select(i).length() <= N2 && sa.select(i - 1).length() <= N2)

{

continue;

}

// adjacent suffixes both from first text string

if (sa.select(i).length() > N2 + 1 && sa.select(i - 1).length() > N2 + 1)

{

continue;

}

// check if adjacent suffixes longer common substring

int length = sa.lcp(i);

if (length > substring.length())

{

substring = sa.select(i).toString().substring(0, length);

System.out.println(substring + " ");

}

}

System.out.println("The length of the substring " + substring.length() + "length on first N " + N1 + " length of Second N " + N2

+ "The length of the array sa: " + N);

System.out.println("'" + substring + "'");

/**
 * Immutable string view over a region of a {@code char[]}.
 *
 * <p>Substrings share the backing array of the string they were cut from and
 * only record a different offset and length, so creating one copies no
 * characters.
 */
final class MyString implements Comparable<MyString> {

    /** Backing characters; shared between a string and all of its substrings. */
    private final char[] arr;

    /** Index in {@link #arr} of the first character of this view. */
    private final int offset;

    /** Number of characters in this view. */
    private final int len;

    /**
     * Creates a view covering all characters of {@code str}.
     *
     * @param str the text to wrap
     */
    public MyString(String str) {
        arr = str.toCharArray();
        offset = 0;
        len = arr.length;
    }

    /** Creates a view of {@code ln} characters of {@code a} starting at {@code of}, without copying. */
    private MyString(char[] a, int of, int ln) {
        arr = a;
        offset = of;
        len = ln;
    }

    /**
     * @return the number of characters in this view
     */
    public int length() {
        return len;
    }

    /**
     * @param idx position relative to the start of this view
     * @return the character at that position
     */
    public char charAt(int idx) {
        return arr[idx + offset];
    }

    /**
     * Compares two views character by character; when one is a prefix of the
     * other, the shorter one orders first.
     *
     * @param other the view to compare against
     * @return a negative value, zero, or a positive value as this view is less
     *         than, equal to, or greater than {@code other}
     */
    @Override
    public int compareTo(MyString other) {
        int limit = Math.min(len, other.len);
        for (int k = 0; k < limit; k++) {
            char mine = arr[offset + k];
            // Read from the other view's own backing array: the two views need not share one.
            char theirs = other.arr[other.offset + k];
            if (mine != theirs) {
                return mine - theirs;
            }
        }
        // One view is a prefix of the other (or they are identical).
        return Integer.compare(len, other.len);
    }

    /**
     * Returns the view of the characters in {@code [beginIndex, endIndex)}.
     *
     * @param beginIndex first position to include, relative to this view
     * @param endIndex position after the last one to include, relative to this view
     * @return a view sharing this view's backing array
     * @throws StringIndexOutOfBoundsException if the range does not lie within this view
     */
    public MyString substring(int beginIndex, int endIndex) {
        if (beginIndex < 0 || endIndex > len || beginIndex > endIndex) {
            throw new StringIndexOutOfBoundsException(
                    "begin " + beginIndex + ", end " + endIndex + ", length " + len);
        }
        return new MyString(arr, beginIndex + offset, endIndex - beginIndex);
    }

    /**
     * Returns the view from {@code beginIndex} to the end of this view.
     *
     * @param beginIndex first position to include, relative to this view
     * @return a view sharing this view's backing array
     * @throws StringIndexOutOfBoundsException if {@code beginIndex} is outside {@code [0, length()]}
     */
    public MyString substring(int beginIndex) {
        // The end index is relative to this view, so it is len rather than offset + len.
        return substring(beginIndex, len);
    }

    /**
     * @param other the object to compare against
     * @return {@code true} if {@code other} is a {@code MyString} with the same characters
     */
    @Override
    public boolean equals(Object other) {
        return (other instanceof MyString) && compareTo((MyString) other) == 0;
    }

    /**
     * @return a hash of the characters in this view, equal for views that are {@link #equals equal}
     */
    @Override
    public int hashCode() {
        int hash = 0;
        for (int k = offset; k < offset + len; k++) {
            hash = 31 * hash + arr[k];
        }
        return hash;
    }

    /**
     * @return a new {@code String} holding a copy of the characters in this view
     */
    @Override
    public String toString() {
        return new String(arr, offset, len);
    }

}

最佳答案

这里:

// Excerpt of the SuffixArray constructor loop: creates one MyString per start index i.
for (int i = 0; i < N; i++)

{

suffixes[i] = snew.substring(i);

}

您不仅存储了整个长字符串,还存储了去掉第 1 个字母后的字符串、去掉前 2 个字母后的字符串,等等。所有这些都是分别存储的。

如果您的字符串只有10个字母,那么您将在10个不同的字符串中存储总共55个字符.

以1000个字符为单位,您总共存储500500个字符.

更一般而言,您必须处理 $\text{length} \times (\text{length} + 1) / 2$ 个字符。

只是为了好玩,我不知道《战争与和平》中有多少个字符,但是页面数约为1250,典型的单词/页面估计为250,平均单词的长度约为5个字符,得出:

$(1250 \times 250 \times 5) \times (1250 \times 250 \times 5 + 1) / 2 \approx 1.2207039 \times 10^{12}$ 个字符。

内存中 char 的大小为 2 个字节,因此您需要大约 2.22 TB 的内存(相比之下,小说本身的文本只有约 1.49 MB)。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值