#-*-coding:utf-8-*-
import urllib
import httplib
from sgmllib import SGMLParser
from urlparse import urlparse
class SuningParser(SGMLParser):
    """SGML parser that collects absolute http(s) links from <a href=...> tags.

    Collected hrefs (each with a trailing CRLF, ready for writelines)
    accumulate in self.urlList.
    """
    errcode = ''
    errmsg = ''
    urlList = []

    def reset(self):
        # Called by SGMLParser.__init__ and before each feed(); rebind an
        # instance-level list so parses never share the class attribute.
        SGMLParser.reset(self)
        self.urlList = []

    def start_a(self, attrs):
        """Record the href of an <a> tag when it starts with 'http'."""
        hrefs = [v for k, v in attrs if k == 'href']
        # Guard: <a> tags without an href made the original raise IndexError.
        # hrefs[0].startswith('http') replaces the obscure
        # `not hrefs[0].find('http')` idiom (find()==0 means prefix match);
        # the leftover debug print is dropped.
        if hrefs and hrefs[0].startswith('http'):
            self.urlList.append(hrefs[0] + "\r\n")
class NetUtil:
    """Thin wrapper around httplib exposing http_get / http_post.

    On a non-200 response the methods record errcode/errmsg on the
    instance and return '' instead of raising.
    """
    errcode = ''
    errmsg = ''

    def _connect(self, url, timeout, is_https):
        """Return (connection, request_path) for url.

        BUGFIX: httplib.HTTPConnection's third positional parameter is
        `strict`, not `timeout` — the original passed the timeout there,
        so it was silently ignored. Pass it by keyword.
        """
        parts = urlparse(url)
        # '' is not a valid request target; default to '/'.
        path = parts.path or '/'
        if parts.query:
            path = path + '?' + parts.query
        if is_https:
            conn = httplib.HTTPSConnection(parts.netloc, 443, timeout=timeout)
        else:
            conn = httplib.HTTPConnection(parts.netloc, 80, timeout=timeout)
        return conn, path

    def _read_response(self, conn):
        """Read the pending response; return the body on 200, '' otherwise."""
        data = ''
        try:
            resp = conn.getresponse()
            if resp.status == 200:
                data = resp.read()
            else:
                # Record the numeric status; the original stored '' here,
                # discarding the code it had just formatted into errmsg.
                self.errcode = str(resp.status)
                self.errmsg = 'http response code(%s):%s' % (resp.status, resp.reason)
        finally:
            # Always release the socket, even when getresponse() raises.
            conn.close()
        return data

    def http_get(self, url, timeout=5, is_https=False):
        """GET url; return the response body, or '' on a non-200 response."""
        conn, path = self._connect(url, timeout, is_https)
        conn.request('GET', path)
        return self._read_response(conn)

    def http_post(self, url, ps=None, timeout=5, is_https=False):
        """POST form-encoded ps to url; return body, or '' on non-200.

        ps defaults to None rather than {} to avoid the shared
        mutable-default-argument pitfall; None is treated as no fields.
        """
        headers = {"Content-type": "application/x-www-form-urlencoded",
                   "Accept": "text/plain"}
        conn, path = self._connect(url, timeout, is_https)
        body = urllib.urlencode(ps or {})
        conn.request('POST', path, body, headers)
        return self._read_response(conn)
url = "http://www.suning.com"
page = urllib.urlopen(url)
parser = SuningParser()
parser.feed(page.read())
print parser.urlList
f = open('E:\py\wb.txt','w+')
f.writelines(parser.urlList)
f.close()
import urllib
import httplib
from sgmllib import SGMLParser
from urlparse import urlparse
# NOTE(review): this class is an accidental duplicate of the SuningParser
# defined earlier in this file; consider deleting one copy.
class SuningParser(SGMLParser):
    """SGML parser that collects absolute http(s) links from <a href=...> tags.

    Collected hrefs (each with a trailing CRLF, ready for writelines)
    accumulate in self.urlList.
    """
    errcode = ''
    errmsg = ''
    urlList = []

    def reset(self):
        # Called by SGMLParser.__init__ and before each feed(); rebind an
        # instance-level list so parses never share the class attribute.
        SGMLParser.reset(self)
        self.urlList = []

    def start_a(self, attrs):
        """Record the href of an <a> tag when it starts with 'http'."""
        hrefs = [v for k, v in attrs if k == 'href']
        # Guard: <a> tags without an href made the original raise IndexError.
        # startswith() replaces the obscure `not s.find('http')` idiom, and
        # the leftover debug print is dropped.
        if hrefs and hrefs[0].startswith('http'):
            self.urlList.append(hrefs[0] + "\r\n")
# NOTE(review): this class is an accidental duplicate of the NetUtil defined
# earlier in this file; consider deleting one copy.
class NetUtil:
    """Thin wrapper around httplib exposing http_get / http_post.

    On a non-200 response the methods record errcode/errmsg on the
    instance and return '' instead of raising.
    """
    errcode = ''
    errmsg = ''

    def _connect(self, url, timeout, is_https):
        """Return (connection, request_path) for url.

        BUGFIX: httplib.HTTPConnection's third positional parameter is
        `strict`, not `timeout` — the original passed the timeout there,
        so it was silently ignored. Pass it by keyword.
        """
        parts = urlparse(url)
        # '' is not a valid request target; default to '/'.
        path = parts.path or '/'
        if parts.query:
            path = path + '?' + parts.query
        if is_https:
            conn = httplib.HTTPSConnection(parts.netloc, 443, timeout=timeout)
        else:
            conn = httplib.HTTPConnection(parts.netloc, 80, timeout=timeout)
        return conn, path

    def _read_response(self, conn):
        """Read the pending response; return the body on 200, '' otherwise."""
        data = ''
        try:
            resp = conn.getresponse()
            if resp.status == 200:
                data = resp.read()
            else:
                # Record the numeric status; the original stored '' here,
                # discarding the code it had just formatted into errmsg.
                self.errcode = str(resp.status)
                self.errmsg = 'http response code(%s):%s' % (resp.status, resp.reason)
        finally:
            # Always release the socket, even when getresponse() raises.
            conn.close()
        return data

    def http_get(self, url, timeout=5, is_https=False):
        """GET url; return the response body, or '' on a non-200 response."""
        conn, path = self._connect(url, timeout, is_https)
        conn.request('GET', path)
        return self._read_response(conn)

    def http_post(self, url, ps=None, timeout=5, is_https=False):
        """POST form-encoded ps to url; return body, or '' on non-200.

        ps defaults to None rather than {} to avoid the shared
        mutable-default-argument pitfall; None is treated as no fields.
        """
        headers = {"Content-type": "application/x-www-form-urlencoded",
                   "Accept": "text/plain"}
        conn, path = self._connect(url, timeout, is_https)
        body = urllib.urlencode(ps or {})
        conn.request('POST', path, body, headers)
        return self._read_response(conn)
url = "http://www.suning.com"
page = urllib.urlopen(url)
parser = SuningParser()
parser.feed(page.read())
print parser.urlList
f = open('E:\py\wb.txt','w+')
f.writelines(parser.urlList)
f.close()
print "done"
# Limitation: only crawls one level deep, and dynamically loaded HTTP
# content (JavaScript-rendered pages) is not handled.