# Python scraping-crawler demo source code — ulunwen original, do not repost
#! /usr/bin/python
# -*- coding: gb2312 -*-
from bs4 import BeautifulSoup
import re
import urllib
import csv
import time,sys
realpage=[]
def save_csv(linedata=[]):
csvfile = file('data.txt','a')
writer = csv.writer(csvfile)
writer.writerows(linedata)
csvfile.close()
def GetPageText(links=[]):
pageinfo=[]
detailinfo=[]
for link in links:
PageContent = urllib.urlopen(link).read()
pageContent = unicode(PageContent, "gb2312").encode("utf8")
PageSoup = BeautifulSoup(PageContent,'html.parser')
#Get page title
time.sleep(1)
detailinfo.append(link)
detailinfo.append(re.compile(r'<[^>]+>',re.S).sub('',''.join(PageSoup.title)))
detailinfo.append(repr(PageSoup.title.next_sibling.next_sibling.get('content')))
#desc= repr(PageSoup.title.next_sibling.next_sibling.get('content')))
time.sleep(1)
print "Start write file"
print detailinfo
raw_input()
save_csv(detailinfo)
time.sleep(1)
#print pageinfo
#save_csv(pageinfo)
def webopen(link):
content = urllib.urlopen(link).read()
soup = BeautifulSoup(content,'html.parser').find('div',class_='catalog05 catalog05d')
alink = soup.find_all('a', {'target':'_blank'})
for al in alink:
urlhref=al.get('href')
if "html" in urlhref:
print urlhref
realpage.append(urlhref)
else:
continue
GetPageText(realpage)
#link = ["index.html"]
link = 'http://ulunwen.com'
webopen(link)
#GetPageText(link)