__author__ = 'bloodchilde'
import urllib
import urllib2
import re
import os
import sys
# Python 2-only hack: site.py deletes sys.setdefaultencoding at startup, so
# reload(sys) is needed to get it back before forcing the process-wide default
# string encoding to UTF-8 (avoids UnicodeDecodeError on implicit str/unicode
# mixing of the scraped Chinese text).
# NOTE(review): this is widely discouraged and has no Python 3 equivalent.
reload(sys)
sys.setdefaultencoding( "utf-8" )
class Spider:
    """Scraper for elementary-school math ("aoshu") problems on aoshu.juren.com.

    Workflow: read the index page to find each grade's listing URL, walk every
    listing page of a grade, fetch each problem page plus its linked answer
    page, and append question/answer records to <grade folder>/test.txt.
    """

    def __init__(self):
        # Entry page that lists the per-grade problem categories.
        self.siteUrl = "http://aoshu.juren.com/tiku/xiaoxueaoshu/"
        # Spoof a desktop browser so the site serves its normal HTML.
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
        self.headers = {'User-Agent': self.user_agent}

    def getPageContent(self, url):
        """Fetch *url* with the browser-like headers and return the body as unicode (UTF-8)."""
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        try:
            return response.read().decode("utf-8")
        finally:
            response.close()  # don't leak the socket (original never closed it)

    def getSiteContent(self):
        """Return [[grade_url, grade_name], ...] scraped from the index page."""
        siteContents = self.getPageContent(self.siteUrl)
        sitePattern = re.compile(
            '<div.*?class="news".*?>.*?<h3.*?class="newdot".*?>.*?'
            '<a.*?href="(.*?)".*?>(.*?)</a>.*?</h3>.*?</div>', re.S)
        return [[href, name] for href, name in re.findall(sitePattern, siteContents)]

    def enterAoshuTiPage(self, gradeLevel, url):
        """Parse one listing page of a grade.

        Returns (contents, pageCount): contents is a list of
        [problem_url, problem_title] pairs; pageCount is the total number of
        listing pages scraped from the pager (a string, "1" if no pager found
        — the original raised IndexError in that case).
        """
        curPageContent = self.getPageContent(url)
        curPattern = re.compile(
            '<li>.*?<span.*?class="left".*?>.*?<a.*?href="(.*?)".*?'
            'target="_blank".*?>(.*?)</a>.*?</span>.*?</li>', re.S)
        contents = [[href, title] for href, title in re.findall(curPattern, curPageContent)]
        pagePattern = re.compile(
            '<div.*?id="pages".*?>.*?<strong>.*?</strong>.*?'
            '<strong>(.*?)</strong>.*?</div>', re.S)
        pageCounts = re.findall(pagePattern, curPageContent)
        pageCount = pageCounts[0] if pageCounts else u"1"
        return contents, pageCount

    def getAoshutiContent(self, url, path):
        """Fetch one problem page, resolve its answer page, and append the
        question/answer pair to <path>/test.txt (also echoed to stdout)."""
        pageContent = self.getPageContent(url)
        pattern = re.compile(
            '<div.*?class="mainContent".*?>.*?<p>.*?</p>.*?<p>(.*?)</p>.*?'
            '<div.*?id="page".*?>.*?<span.*?class="current".*?>.*?</span>.*?'
            '<a.*?href="(.*?)".*?>.*?</a>.*?</div>.*?</div>', re.S)
        items = re.findall(pattern, pageContent)
        filePath = path + u"/test.txt"
        f = open(filePath, 'a+')
        try:  # close the file even if a fetch or parse below raises
            for timu, daAnUrl in items:
                daAnContent = self.getDaAn(daAnUrl)
                print("question:" + timu)
                print("answer:" + daAnContent)
                f.write("question:" + timu + "\r\n" +
                        "answer:" + daAnContent + "\r\n\r\n\r\n\r\n")
        finally:
            f.close()

    def mk_dir(self, path):
        """Create *path* (with parents) if it does not exist.

        Returns True when the directory was created, False if it already existed.
        """
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def getDaAn(self, url):
        """Return the answer text scraped from an answer page.

        Raises IndexError if the expected markup is absent (unchanged from the
        original behavior).
        """
        page = self.getPageContent(url)
        pattern = re.compile(
            '<div.*?class="mainContent".*?>.*?<p>.*?</p>.*?<p>(.*?)</p>.*?</div>', re.S)
        items = re.findall(pattern, page)
        return items[0]

    def getAoShuTi(self, grade, url):
        """Download every problem for *grade* into a per-grade folder.

        Listing page 1 is *url* itself; page N (N >= 2) is <url>/index_N.html.
        """
        global_url = url
        path = u"C:/Users/bloodchilde/Desktop/image_python/" + grade
        self.mk_dir(path)
        contents, pageCount = self.enterAoshuTiPage(grade, global_url)
        # Upper bound is pageCount + 1 so the last listing page is included;
        # the original range(1, pageCount) silently dropped it.
        for pageIndex in range(1, int(pageCount) + 1):
            if pageIndex != 1:
                pageUrl = global_url + u"/index_" + str(pageIndex) + ".html"
                contents, _ = self.enterAoshuTiPage(grade, pageUrl)
            self._save_page_contents(contents, path)

    def _save_page_contents(self, contents, path):
        """Save every [problem_url, title] pair in *contents* under *path*.

        (Extracted from the two identical loops the original duplicated in
        getAoShuTi's if/else branches.)
        """
        for tiUrl, tiName in contents:
            # Titles look like "<prefix>:<name>"; keep the part after the colon.
            name = tiName.split(":")[1]
            print(name)
            self.getAoshutiContent(tiUrl, path)
# Script entry: crawl every grade category found on the index page and
# download its problems. Runs at import time, exactly like the original.
demo = Spider()
separator = "---------------------------------------"
for url, grade in demo.getSiteContent():
    print(grade)
    print(separator)
    demo.getAoShuTi(grade, url)
# Analysis (translated from the original Chinese notes; commented out so the
# file remains valid Python):
# 1. Problem-source page: http://aoshu.juren.com/tiku/xiaoxueaoshu/
# 2. Parse that page to find how many grade levels there are and locate each grade's URL.
# 3. Parse the URL from step 2 to get the page count and the problem URLs.
# 4. Parse each problem URL to get the question text and the answer URL.
# 5. Parse the answer URL to get the answer text.
# 6. Write the collected questions and answers to a file.