用Java写了个网络爬虫,用来扒百度贴吧的图片,还不是很成熟,留作纪念。
package spider;
import java.io.*;
import java.net.*;
import java.util.*;
/**
 * Small crawler that downloads the {@code .jpg} images posted in the threads of a Baidu Tieba
 * forum.
 *
 * <p>The steps are meant to be called in the order used by {@link #main(String[])}:
 * construct (fetches the forum page into a local file), {@link #storePage()},
 * {@link #storeTiezhiLink()}, {@link #getTieZhi_ALL_Link()}, {@link #getSource()}.
 *
 * <p>All parsing is line based and keyed on literal HTML fragments, so it only recognises pages
 * whose markup contains exactly those fragments. Remote pages are decoded as gb2312.
 */
public class Spider {
    /** Forum page fetched by the no-argument constructor. */
    private static final String DEFAULT_START_URL =
            "http://tieba.baidu.com/f?kw=%C6%CF%CC%D1%C4%BE%B6%FA";
    /** Prefix put in front of the site-relative thread links found in the pages. */
    private static final String SITE_ROOT = "http://tieba.baidu.com";
    /** Charset used to decode every remote page. */
    private static final String PAGE_CHARSET = "gb2312";
    /** Start of a line that holds a link to a thread. */
    private static final String THREAD_ANCHOR_PREFIX = "<a href=\"/p/";
    /**
     * End of the "last page" pager link. The two escaped characters are the Chinese anchor text;
     * they are written as escapes so the source compiles regardless of the compiler's encoding.
     */
    private static final String LAST_PAGE_ANCHOR_SUFFIX = ">\u5C3E\u9875</a>";
    /** Fragment that introduces the {@code src} attribute of a posted image. */
    private static final String IMAGE_TAG_MARKER = "<img class=\"BDE_Image\" src=";
    /** Highest {@code pn} offset requested from the thread list, and the step between offsets. */
    private static final int LAST_LIST_OFFSET = 50;
    private static final int LIST_OFFSET_STEP = 50;
    /** Number of trailing URL characters used as the local image file name. */
    private static final int IMAGE_NAME_LENGTH = 20;
    /** Network timeouts, so that one stalled connection cannot hang the crawl forever. */
    private static final int CONNECT_TIMEOUT_MS = 15000;
    private static final int READ_TIMEOUT_MS = 60000;
    /** Characters replaced when a thread title or image name is used as a file name. */
    private static final String UNSAFE_FILE_NAME_CHARS = "\\/:*?\"<>|";

    /** Thread-list page URLs still waiting to be scanned. */
    private LinkedList<String> titleList;
    /** Thread title mapped to the URLs of all pages of the thread(s) carrying that title. */
    private TreeMap<String, LinkedList<String>> filename_tiezhiLinkList;
    /** Absolute thread URL mapped to the thread title. */
    private TreeMap<String, String> href_title;
    /** Local file that receives a copy of the forum start page. */
    private String initOutPage;

    /**
     * Creates a spider for the default forum and stores its start page in
     * {@code initOutPage.html} in the working directory.
     *
     * @throws IOException if the page cannot be fetched or the file cannot be written
     */
    public Spider() throws IOException, Exception {
        this(DEFAULT_START_URL);
    }

    /**
     * Creates a spider for the forum page at {@code startUrl} and stores that page in
     * {@code initOutPage.html} in the working directory.
     *
     * @param startUrl URL of the forum page to start from
     * @throws IOException if the page cannot be fetched or the file cannot be written
     */
    public Spider(String startUrl) throws IOException, Exception {
        initOutPage = "initOutPage.html";
        titleList = new LinkedList<String>();
        href_title = new TreeMap<String, String>();
        filename_tiezhiLinkList = new TreeMap<String, LinkedList<String>>();
        // Fetch before opening the file so a failed download does not truncate an older copy.
        String page = getUrlDetail(startUrl, true);
        try (FileWriter outHtml = new FileWriter(initOutPage)) {
            outHtml.write(page);
        }
    }

    /**
     * Downloads {@code fileUrl} into the file {@code fileDes} unless that file already exists.
     *
     * <p>The data is downloaded completely before the file is created, so a failed download does
     * not leave an empty file behind that later runs would mistake for a finished one.
     *
     * @param fileUrl URL of the resource to download
     * @param fileDes path of the file to create
     * @throws Exception if the download fails or the file cannot be written
     */
    public void saveUrlFile(String fileUrl, String fileDes) throws Exception {
        File target = new File(fileDes);
        if (target.exists()) {
            return;
        }
        byte[] data = getUrlFileData(fileUrl);
        try (FileOutputStream out = new FileOutputStream(target)) {
            out.write(data);
        }
    }

    /**
     * Reads the whole resource at {@code fileUrl} into memory.
     *
     * @param fileUrl URL of the resource
     * @return the raw bytes of the resource
     * @throws Exception if the URL is malformed or the resource cannot be read
     */
    public byte[] getUrlFileData(String fileUrl) throws Exception {
        try (InputStream in = openStream(fileUrl);
                ByteArrayOutputStream collected = new ByteArrayOutputStream()) {
            byte[] chunk = new byte[8192];
            int len;
            while ((len = in.read(chunk)) != -1) {
                collected.write(chunk, 0, len);
            }
            return collected.toByteArray();
        }
    }

    /**
     * Reads the text at {@code urlStr}, decoded as gb2312.
     *
     * @param urlStr URL of the page; any protocol supported by {@link URL} is accepted
     * @param withSep {@code true} to terminate every line with {@code "\n"}, {@code false} to
     *     concatenate the lines without any separator
     * @return the page text
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public static String getUrlDetail(String urlStr, boolean withSep) throws Exception {
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = openPageReader(urlStr)) {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
                if (withSep) {
                    text.append('\n');
                }
            }
        }
        return text.toString();
    }

    /**
     * Finds the first line of the local file that, once trimmed, starts with
     * {@code <meta furl=} and ends with {@code ">}, and returns the text between the first pair
     * of double quotes on that line.
     *
     * @param filename path of the saved page
     * @return the quoted value, or {@code null} if no such line exists
     * @throws Exception if the file cannot be read
     */
    public String getTitleLink(String filename) throws Exception {
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (line.startsWith("<meta furl=") && line.endsWith("\">")) {
                    System.out.println(line);
                    return line.split("\"")[1];
                }
            }
        }
        return null;
    }

    /**
     * Queues the thread-list page URLs (offsets 0 and 50) built from the link stored in the
     * saved start page.
     *
     * @throws IOException if the saved start page holds no {@code <meta furl=...>} line
     * @throws Exception if the saved start page cannot be read
     */
    public void storePage() throws Exception {
        String titleLink = getTitleLink(initOutPage);
        if (titleLink == null) {
            // Fail here with a clear message instead of queueing "http://null...".
            throw new IOException("no <meta furl=...> line found in " + initOutPage);
        }
        String base = "http://" + titleLink + "&tp=0&pn=";
        for (int offset = 0; offset <= LAST_LIST_OFFSET; offset += LIST_OFFSET_STEP) {
            titleList.add(base + offset);
            System.out.println(titleList.peekLast());
            System.out.println(titleList.size());
        }
    }

    /**
     * Scans every queued thread-list page and records each thread link it finds.
     *
     * <p>A thread link is a trimmed line starting with {@code <a href="/p/}. The first quoted
     * value is used as the site-relative link and the second quoted value as the title
     * (presumably the anchor's {@code title} attribute -- verify against the live markup).
     * The first title seen for a thread URL is kept.
     *
     * @throws IOException if a list page cannot be read
     */
    public void storeTiezhiLink() throws MalformedURLException, IOException {
        while (!titleList.isEmpty()) {
            String listPageUrl = titleList.removeFirst();
            System.out.println(listPageUrl);
            System.out.println(titleList.size());
            try (BufferedReader reader = openPageReader(listPageUrl)) {
                String line;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    if (!line.startsWith(THREAD_ANCHOR_PREFIX)) {
                        continue;
                    }
                    String[] parts = line.split("\"");
                    if (parts.length < 4) {
                        continue; // no second quoted value, so there is no title to record
                    }
                    String href = parts[1];
                    String title = parts[3];
                    System.out.println(href + " " + title);
                    // Look up the same absolute URL that is stored as the key.
                    String threadUrl = SITE_ROOT + href;
                    if (!href_title.containsKey(threadUrl)) {
                        href_title.put(threadUrl, title);
                    }
                }
            }
        }
    }

    /**
     * Works out the URL of every page of every recorded thread.
     *
     * <p>A thread page that contains a "last page" pager link yields one URL per page number;
     * any other thread yields just its own URL. Threads that share a title share one entry, and
     * their page URLs are merged so that none of them is dropped.
     *
     * @throws IOException if a thread page cannot be read
     */
    public void getTieZhi_ALL_Link() throws MalformedURLException, IOException {
        for (Map.Entry<String, String> thread : href_title.entrySet()) {
            String threadUrl = thread.getKey();
            String title = thread.getValue();

            String lastPageAnchor = findLastPageAnchor(threadUrl);
            LinkedList<String> pageLinks =
                    lastPageAnchor == null ? null : buildPageLinks(lastPageAnchor);
            if (pageLinks == null) {
                pageLinks = new LinkedList<String>();
                pageLinks.add(threadUrl);
            }

            LinkedList<String> known = filename_tiezhiLinkList.get(title);
            if (known == null) {
                filename_tiezhiLinkList.put(title, pageLinks);
            } else {
                for (String pageLink : pageLinks) {
                    if (!known.contains(pageLink)) {
                        known.add(pageLink);
                    }
                }
            }
            // Label text (Chinese): "number of threads: ".
            System.out.println("\u5E16\u5B50\u7684\u4E2A\u6570: " + filename_tiezhiLinkList.size());
        }
    }

    /**
     * Collects the distinct {@code .jpg} image URLs found on one thread page.
     *
     * <p>Every occurrence of {@code <img class="BDE_Image" src=} is examined, including several
     * on the same line, and the quoted value that follows it is taken as the image URL.
     *
     * @param link URL of the thread page
     * @return the image URLs in the order they first appear; empty if there are none
     * @throws IOException if the page cannot be read
     */
    public LinkedList<String> get_image_URL(String link) throws MalformedURLException, IOException {
        LinkedList<String> imagelist = new LinkedList<String>();
        Set<String> seen = new HashSet<String>();
        try (BufferedReader br = openPageReader(link)) {
            String line;
            while ((line = br.readLine()) != null) {
                collectImageUrls(line, seen, imagelist);
            }
        }
        return imagelist;
    }

    /**
     * Downloads every image of every recorded thread into {@code Source/<thread title>/}.
     *
     * <p>Characters that are not valid in file names are replaced with {@code _}. A thread whose
     * directory cannot be created, or an image that cannot be downloaded, is reported on
     * {@code System.err} and skipped so that the remaining images are still fetched.
     *
     * @throws IOException if a thread page cannot be read
     */
    public void getSource() throws MalformedURLException, IOException, Exception {
        File sourceDir = new File("Source");
        for (Map.Entry<String, LinkedList<String>> thread : filename_tiezhiLinkList.entrySet()) {
            String title = thread.getKey();
            LinkedList<String> pageLinks = thread.getValue();
            System.out.println("...........................");
            System.out.println("titel: " + title);
            for (String pageLink : pageLinks) {
                System.out.println(pageLink);
            }
            System.out.println("...........................");

            File threadDir = new File(sourceDir, toSafeFileName(title));
            // mkdirs, not mkdir: the parent "Source" directory may not exist yet.
            if (!threadDir.isDirectory() && !threadDir.mkdirs()) {
                System.err.println("cannot create directory " + threadDir + ", thread skipped");
                continue;
            }
            for (String pageLink : pageLinks) {
                for (String imageUrl : get_image_URL(pageLink)) {
                    System.out.println(imageUrl);
                    File target = new File(threadDir, imageNameFromUrl(imageUrl));
                    try {
                        saveUrlFile(imageUrl, target.getPath());
                    } catch (IOException e) {
                        System.err.println("cannot download " + imageUrl + ": " + e);
                    }
                }
            }
        }
    }

    /**
     * Runs the whole crawl for the default forum. Failures are reported on {@code System.err}
     * and make the process exit with status 1.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            Spider spider = new Spider();
            spider.storePage();
            spider.storeTiezhiLink();
            spider.getTieZhi_ALL_Link();
            spider.getSource();
        } catch (Exception e) {
            System.err.println("Spider failed: " + e);
            e.printStackTrace();
            System.exit(1);
        }
    }

    /** Opens the resource at {@code urlStr} for reading, with connect and read timeouts set. */
    private static InputStream openStream(String urlStr) throws IOException {
        URLConnection connection = new URL(urlStr).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        return connection.getInputStream();
    }

    /** Opens the page at {@code urlStr} as a line reader that decodes gb2312. */
    private static BufferedReader openPageReader(String urlStr) throws IOException {
        InputStream in = openStream(urlStr);
        try {
            return new BufferedReader(new InputStreamReader(in, PAGE_CHARSET));
        } catch (IOException e) {
            in.close();
            throw e;
        }
    }

    /** Returns the trimmed "last page" pager line of a thread page, or null if it has none. */
    private static String findLastPageAnchor(String threadUrl) throws IOException {
        try (BufferedReader reader = openPageReader(threadUrl)) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Trimmed like the other line parsers, so indented markup still matches.
                String trimmed = line.trim();
                if (trimmed.startsWith(THREAD_ANCHOR_PREFIX)
                        && trimmed.endsWith(LAST_PAGE_ANCHOR_SUFFIX)) {
                    return trimmed;
                }
            }
        }
        return null;
    }

    /**
     * Expands a "last page" pager line into the URLs of pages 1..N, where N is the number after
     * the last '=' of the link. Returns null if no positive page count can be read from it.
     */
    private static LinkedList<String> buildPageLinks(String lastPageAnchor) {
        String[] parts = lastPageAnchor.split("\"");
        if (parts.length < 2) {
            return null;
        }
        String href = parts[1];
        int eq = href.lastIndexOf('=');
        if (eq < 0) {
            return null;
        }
        int pageCount;
        try {
            pageCount = Integer.parseInt(href.substring(eq + 1));
        } catch (NumberFormatException e) {
            return null;
        }
        if (pageCount < 1) {
            return null;
        }
        // Label text (Chinese): "pages in one thread: ".
        System.out.println("\u4E00\u4E2A\u5E16\u5B50\u7684\u9875\u6570: " + pageCount);
        String prefix = SITE_ROOT + href.substring(0, eq + 1);
        LinkedList<String> links = new LinkedList<String>();
        for (int page = 1; page <= pageCount; page++) {
            String pageLink = prefix + page;
            System.out.println(pageLink);
            links.add(pageLink);
        }
        return links;
    }

    /** Appends to {@code out} every not-yet-seen .jpg image URL found in one line of markup. */
    private static void collectImageUrls(String line, Set<String> seen, List<String> out) {
        int from = 0;
        int at;
        while ((at = line.indexOf(IMAGE_TAG_MARKER, from)) >= 0) {
            int open = at + IMAGE_TAG_MARKER.length();
            from = open;
            if (open >= line.length() || line.charAt(open) != '"') {
                continue; // src value is not quoted; nothing reliable to extract
            }
            int close = line.indexOf('"', open + 1);
            if (close < 0) {
                break; // unterminated attribute, the rest of the line is unusable
            }
            String src = line.substring(open + 1, close);
            from = close + 1;
            System.out.println(src);
            if (src.endsWith(".jpg") && seen.add(src)) {
                out.add(src);
            }
        }
    }

    /**
     * Derives the local file name of an image: the last 20 characters of its URL (the whole URL
     * if it is shorter), cut after the last '/' and made safe for the file system.
     */
    private static String imageNameFromUrl(String imageUrl) {
        int start = Math.max(0, imageUrl.length() - IMAGE_NAME_LENGTH);
        String tail = imageUrl.substring(start);
        return toSafeFileName(tail.substring(tail.lastIndexOf('/') + 1));
    }

    /** Replaces control characters and \ / : * ? " < > | with '_' so the name is a valid file name. */
    private static String toSafeFileName(String name) {
        StringBuilder cleaned = new StringBuilder(name.length());
        for (int i = 0; i < name.length(); i++) {
            char c = name.charAt(i);
            boolean unsafe = c < ' ' || UNSAFE_FILE_NAME_CHARS.indexOf(c) >= 0;
            cleaned.append(unsafe ? '_' : c);
        }
        String safe = cleaned.toString().trim();
        if (safe.isEmpty() || safe.equals(".") || safe.equals("..")) {
            return "untitled";
        }
        return safe;
    }
}
运行效果