1.爬虫URL去重实战-SpringBoot2.x+Guava布隆过滤器
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-test</artifactId>
    <scope>test</scope>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.12.0</version>
</dependency>
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>31.1-jre</version>
</dependency>
/**
 * Generates the sample data set for the Bloom-filter demo: five million pseudo-random URLs of
 * the form {@code https://www.<5 random letters>.com<index>}, one per line, appended to
 * {@code url.txt} in the project's resources directory.
 *
 * <p>The file is opened in append mode, so running this test repeatedly grows the file.
 *
 * @throws IllegalStateException if the file cannot be created or written, so that an I/O
 *     failure fails the test instead of being printed and ignored
 */
@Test
public void testGeneUrl() {
    // Write to the same url.txt that the bloomFilter() bean reads; the resources directory
    // itself cannot be opened as an output file.
    File file = new File("D:\\ideaworkspace\\bloomfilter-test\\src\\main\\resources\\url.txt");

    // try-with-resources closes the whole writer chain even when a write fails.
    // FileOutputStream creates the file when it does not exist yet.
    try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(file, true), java.nio.charset.StandardCharsets.UTF_8))) {
        for (int i = 0; i < 5000000; i++) {
            String name = RandomStringUtils.randomAlphabetic(5);
            // Stream each URL straight to the buffered writer rather than holding all
            // five million lines in memory at once.
            bw.write("https://www." + name + ".com" + i + "\n");
        }
    } catch (IOException e) {
        throw new IllegalStateException("Failed to write URL file: " + file, e);
    }
}
// Argument 1: the funnel describing what type of data the Bloom filter stores, e.g. IntegerFunnel, LongFunnel, StringCharsetFunnel
// Argument 2: the expected number of elements to be stored
// Argument 3: the false-positive probability; the default is 0.03
BloomFilter.create( Funnels.stringFunnel( Charset.forName( "UTF-8" )) , 5000000 , 0.01 ) ;
通过 @Bean 的方式将文件的内容加载到 BloomFilter 中:
/**
 * Creates the Bloom filter bean and pre-loads it with the URLs listed in {@code url.txt},
 * one URL per line.
 *
 * <p>The filter is sized for 5,000,000 expected insertions with a 0.01 target false-positive
 * probability.
 *
 * @return a Bloom filter containing every non-empty line of the URL file
 * @throws IOException if the URL file cannot be opened or read
 */
@Bean
public BloomFilter bloomFilter() throws IOException {
    // Use one charset for both the funnel and the file decoding so the result does not
    // depend on the platform default encoding.
    Charset utf8 = Charset.forName("UTF-8");
    BloomFilter<String> filter = BloomFilter.create(Funnels.stringFunnel(utf8), 5000000, 0.01);

    File urlFile = new File("D:\\ideaworkspace\\bloomfilter-test\\src\\main\\resources\\url.txt");

    // try-with-resources releases the file handle even if reading or inserting fails.
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(urlFile), utf8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // A blank line is not a URL; keep the empty string out of the filter.
            if (!line.isEmpty()) {
                filter.put(line);
            }
        }
    }
    return filter;
}
/**
 * REST endpoint that demonstrates a membership lookup against the injected Bloom filter.
 */
@RestController
@RequestMapping("/api")
public class BloomFilterController {

    /** Bloom filter injected by Spring. */
    @Autowired
    private BloomFilter bloomFilter;

    /**
     * Checks a fixed sample URL against the Bloom filter.
     *
     * @return {@code true} if the filter reports that it might contain the sample URL,
     *     {@code false} otherwise
     */
    @RequestMapping("/bloomFilter")
    public boolean bloomFilter() {
        final String candidateUrl = "https://www.TpxVs.com10";
        return bloomFilter.mightContain(candidateUrl);
    }
}
如果使用 Set 集合来去重,当数据量很大时,会出现堆内存溢出(OutOfMemoryError)的错误。