Java爬虫Jsoup+正则表达式爬取必应壁纸
import org. jsoup. Jsoup;
import org. jsoup. nodes. Document;
import java. io. File;
import java. io. FileOutputStream;
import java. util. Scanner;
import java. util. regex. Matcher;
import java. util. regex. Pattern;
/**
 * Command-line downloader for the wallpaper listing at {@code https://bing.ioliu.cn/}.
 *
 * <p>The program asks on standard input how many listing pages to process. For each
 * page it fetches {@code https://bing.ioliu.cn/?p=<page>}, extracts the detail-page
 * links with a regular expression, fetches every detail page, extracts the image URL
 * from the {@code data-progressive} attribute of the target {@code <img>} tag, and
 * stores the image bytes under {@code C://bring//}.
 */
public class DIYMain2 {

    /** Site root; listing and detail URLs are built by appending to it. */
    private static final String SITE_ROOT = "https://bing.ioliu.cn/";

    /** Matches a detail-page anchor in the serialized listing page; group 1 is its href. */
    private static final Pattern DETAIL_LINK =
            Pattern.compile("<a class=\"mark\" href=\"(.*?)\"></a>");

    /** Matches the wallpaper image tag in a serialized detail page; group 1 is the image URL. */
    private static final Pattern IMAGE_TAG = Pattern.compile(
            "<img class=\"target progressive__img progressive--not-loaded\" src=\".*?\" data-progressive=\"(.*?)\">");

    /** Directory that receives the downloaded images. */
    private static final File OUTPUT_DIR = new File("C://bring//");

    /**
     * Reads the number of pages from standard input and downloads every wallpaper
     * found on those pages.
     *
     * <p>A failure while downloading or writing a single image is reported on
     * standard error and that image is skipped. A failure while fetching a listing
     * page or a detail page is not caught here and terminates the program.
     *
     * @param args ignored
     * @throws Exception if the page count cannot be read, or a listing or detail
     *                   page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        // Deliberately not closed: closing the Scanner would also close System.in.
        Scanner scanner = new Scanner(System.in);
        System.out.println("请输入你要下载多少页");
        int howMuchPage = scanner.nextInt();

        for (int page = 1; page <= howMuchPage; page++) {
            Document listing = Jsoup.connect(SITE_ROOT + "?p=" + page).get();
            Matcher detailLinks = DETAIL_LINK.matcher(listing.toString());

            // Per-page image counter used in the file name; it only advances on success.
            int i = 1;
            while (detailLinks.find()) {
                String detailUrl = SITE_ROOT + detailLinks.group(1);
                String detailHtml = Jsoup.connect(detailUrl).get().toString();
                Matcher images = IMAGE_TAG.matcher(detailHtml);

                while (images.find()) {
                    String imageUrl = images.group(1);
                    String fileName = "第" + page + "页-第" + i + "张.jpg";
                    try {
                        saveImage(imageUrl, new File(OUTPUT_DIR, fileName));
                    } catch (Exception e) {
                        // Best effort: one broken image must not abort the whole run,
                        // so any failure is caught, but it is reported, not hidden.
                        System.err.println(fileName + "下载失败: " + imageUrl + " (" + e + ")");
                        continue;
                    }
                    System.out.println(fileName + "正在下载.......");
                    i++;
                }
            }
        }
        System.out.println("下载完成!");
    }

    /**
     * Downloads {@code imageUrl} and writes the response body to {@code target},
     * creating the parent directory of {@code target} first if it does not exist.
     *
     * @param imageUrl absolute URL of the image to fetch
     * @param target   file that receives the image bytes; an existing file is overwritten
     * @throws java.io.IOException if the directory cannot be created, the download
     *                             fails, or the file cannot be written
     */
    private static void saveImage(String imageUrl, File target) throws java.io.IOException {
        File dir = target.getParentFile();
        // mkdirs() returns false when the directory already exists, so re-check
        // isDirectory() afterwards before treating the result as an error.
        if (dir != null && !dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new java.io.IOException("Cannot create output directory: " + dir);
        }

        // ignoreContentType(true): the response is an image, not HTML.
        byte[] bytes = Jsoup.connect(imageUrl).ignoreContentType(true).execute().bodyAsBytes();

        // try-with-resources closes the stream even if write() throws.
        try (FileOutputStream out = new FileOutputStream(target)) {
            out.write(bytes);
        }
    }
}