<!-- Maven dependency on the jsoup HTML parser, which the Robot crawler class uses. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
package
com.tps.common
;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
/**
* (
爬虫
)
*
*
@author
Sum
*
@date
2017-09-29 9:13
*/
public class
Robot
{
public static void
getQuestion
(
int
page
){
//
定义即将访问的链接
String
url
=
"https://www.nowcoder.com/ta/review-java/review?page="
+
page
;
//
定义一个字符串用来存储网页内容
String
result
=
""
;
//
定义一个缓冲字符输入流
BufferedReader
in
=
null
;
try
{
//
将
string
转成
url
对象
URL
realUrl
=
new
URL
(
url
);
//
初始化一个链接到那个
url
的连接
URLConnection
connection
=
realUrl
.
openConnection
();
//
开始实际的连接
connection
.
connect
();
//
初始化
BufferedReader
输入流来读取
URL
的响应
in
=
new
BufferedReader
(
new
InputStreamReader
(
connection
.
getInputStream
()));
//
用来临时存储抓取到的每一行的数据
String
line
;
while
((
line
=
in
.
readLine
()) !=
null
)
{
//
遍历抓取到的每一行并将其存储到
result
里面
result
+=
line
+
"
\n
"
;
}
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
"
发送
GET
请求出现异常!
"
+
e
);
e
.
printStackTrace
();
}
//
使用
finally
来关闭输入流
finally
{
try
{
if
(
in
!=
null
)
{
in
.
close
();
}
}
catch
(
Exception
e2
)
{
e2
.
printStackTrace
();
}
}
// System.out.println(result);
Document
doc
=
Jsoup
.
parse
(
result
);
Elements
title
=
doc
.
getElementsByClass
(
"final-question"
);
System
.
out
.
println
(
page
+
"."
+
title
.
get
(
0
).
ownText
());
Elements
question
=
doc
.
getElementsByClass
(
"design-answer-box"
);
System
.
out
.
println
(
question
.
get
(
0
).
ownText
());
System
.
out
.
println
(
question
.
get
(
0
));
}
public static void
main
(
String
[]
args
)
{
/*for(int i=1;i<120;i++){
getQuestion(i);
}*/
getQuestion
(
13
);
}
}