Jsoup抓取唐诗三百首
需求
- 抓取唐诗三百首
- 获得每一句诗的最后两个字
- 将这两个字转成拼音并拼成 xx.com,验证该域名是否已被注册
- 输出对应域名尚未被注册的那两个字
工具
用到的工具:Jsoup、jpinyin、Maven、Java
代码
maven配置文件
<!-- Third-party libraries used by the Name class below. -->
<dependencies>
<!-- jsoup: fetches the pages over HTTP and selects elements with CSS-style queries. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.2</version>
</dependency>
<!-- jpinyin: converts the Chinese characters to toneless pinyin for the domain name. -->
<dependency>
<groupId>com.github.stuxuhai</groupId>
<artifactId>jpinyin</artifactId>
<version>1.1.8</version>
</dependency>
</dependencies>
代码实现
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.github.stuxuhai.jpinyin.PinyinFormat;
import com.github.stuxuhai.jpinyin.PinyinHelper;
/**
 * Scrapes a Tang-poem index page, takes the last two characters of every
 * verse line, converts them to toneless pinyin and prints those whose
 * {@code <pinyin>.com} domain is reported as available.
 *
 * <p>All work is best-effort: a page that cannot be fetched or parsed is
 * reported on {@code System.err} and skipped, it never aborts the run.
 */
public class Name {

    /** Index page that links to the individual poems. */
    private static final String INDEX_URL = "http://www.gushiwen.org/gushi/tangshi.aspx";

    /** Prefix prepended to the relative links found on the index page. */
    private static final String SITE_ROOT = "http://www.gushiwen.org";

    /**
     * Only links whose {@code href} has exactly this length are followed.
     * NOTE(review): presumably this is the length of a poem-detail link on the
     * index page; confirm against the current page markup.
     */
    private static final int POEM_HREF_LENGTH = 25;

    /**
     * Regex splitting a poem into verse lines. A character class is used so
     * that {@code ?} is a literal: the former pattern {@code ",|。|?"} ended in
     * a dangling quantifier and made {@code split} throw on every poem.
     * Both ASCII and full-width comma / question mark are accepted.
     */
    private static final String LINE_DELIMITERS = "[,\uFF0C\u3002?\uFF1F]";

    /** Number of trailing characters taken from each verse line. */
    private static final int TAIL_LENGTH = 2;

    /** Domain-check endpoint; the domain name is appended to it. */
    private static final String CHECK_URL = "http://panda.www.net.cn/cgi-bin/check.cgi?area_domain=";

    /** Text the check endpoint's response contains when the domain is free. */
    private static final String AVAILABLE_MARKER = "210 : Domain name is available";

    /**
     * Loads the index page and processes every link that looks like a poem.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            Document index = Jsoup.connect(INDEX_URL).get();
            for (Element link : index.select("a[href$=.aspx]")) {
                String href = link.attr("href");
                if (href.length() == POEM_HREF_LENGTH) {
                    printLast2Name(SITE_ROOT + href);
                }
            }
        } catch (Exception e) {
            // Top-level boundary: report instead of silently exiting.
            System.err.println("Failed to process poem index " + INDEX_URL + ": " + e);
        }
    }

    /**
     * Fetches one poem page and prints, for every verse line, the last two
     * characters followed by their pinyin in parentheses, but only when the
     * corresponding {@code .com} domain is available. Two blank lines are
     * printed after each poem.
     *
     * @param url absolute URL of the poem page
     */
    public static void printLast2Name(String url) {
        try {
            Document page = Jsoup.connect(url).get();
            Element poem = page.select("p[align$=center]").first();
            if (poem == null) {
                System.err.println("No poem text found at " + url);
                return;
            }
            for (String segment : poem.text().split(LINE_DELIMITERS)) {
                String line = segment.trim();
                if (line.length() < TAIL_LENGTH) {
                    // Too short to take a two-character tail; skip it rather
                    // than abort the rest of the poem.
                    continue;
                }
                String lastName = line.substring(line.length() - TAIL_LENGTH);
                String pinyin = PinyinHelper.convertToPinyinString(lastName, "", PinyinFormat.WITHOUT_TONE);
                if (register(pinyin + ".com")) {
                    System.out.print(lastName);
                    System.out.print("("+ pinyin + ") ");
                }
            }
            System.out.println();
            System.out.println();
        } catch (Exception e) {
            // Broad on purpose: covers both I/O failures and any exception the
            // pinyin conversion may raise. The poem is skipped, not the run.
            System.err.println("Failed to process poem " + url + ": " + e);
        }
    }

    /**
     * Asks the domain-check endpoint whether a domain is available.
     *
     * @param url the domain name to check, e.g. {@code "mingyue.com"}
     * @return {@code true} if the response contains the "available" marker;
     *         {@code false} otherwise, including when the lookup itself fails
     */
    public static boolean register(String url) {
        try {
            Document response = Jsoup.connect(CHECK_URL + url).get();
            return response.text().contains(AVAILABLE_MARKER);
        } catch (IOException e) {
            System.err.println("Domain check failed for " + url + ": " + e);
            return false;
        }
    }
}