个人借鉴网上爬虫代码改写的 Python 脚本

最新推荐文章于 2023-06-26 15:16:19 发布

qq_25803197

最新推荐文章于 2023-06-26 15:16:19 发布

阅读量307

点赞数

文章标签：爬虫

本文链接：https://blog.csdn.net/qq_25803197/article/details/52231122

版权

#-*-coding:utf-8-*-
import urllib
import httplib
from sgmllib import SGMLParser
from urlparse import urlparse

class SuningParser(SGMLParser):
errcode=''
errmsg=''
urlList = []
def reset(self):
SGMLParser.reset(self)
self.urlList = []
def start_a(self,attrs):
linkUrl = [v for k,v in attrs if k == 'href']
if not linkUrl[0].find('http'):
print linkUrl[0].find('http')
self.urlList.append(linkUrl[0]+"\r\n")


class NetUtil:
    """Minimal HTTP GET/POST helper built on ``httplib``.

    Both request methods return the response body when the server
    answers with status 200 and an empty string otherwise. After a
    non-200 response, ``errcode`` holds the HTTP status code and
    ``errmsg`` a readable description; both are reset to ``''`` at the
    start of every request.
    """

    errcode = ''
    errmsg = ''

    @staticmethod
    def _split_url(url, is_https=False):
        """Split ``url`` into ``(use_https, host, port, target)``.

        ``use_https`` is true when ``is_https`` is set or the URL scheme
        is ``https``. ``port`` is the port given in the URL, falling back
        to 443 for HTTPS and 80 otherwise. ``target`` is the request
        target (path plus optional params and query), defaulting to
        ``'/'`` when the URL has no path.
        """
        parts = urlparse(url)
        use_https = bool(is_https) or parts.scheme == 'https'
        port = parts.port or (443 if use_https else 80)
        # An empty request target is not a valid request line.
        target = parts.path or '/'
        if parts.params:
            target += ';' + parts.params
        if parts.query:
            target += '?' + parts.query
        # hostname (unlike netloc) excludes any ":port" or "user@" part.
        return use_https, parts.hostname or '', port, target

    def _request(self, method, url, body, headers, timeout, is_https):
        """Send one request and return the body, or ``''`` if not 200."""
        use_https, host, port, target = self._split_url(url, is_https)
        self.errcode = ''
        self.errmsg = ''
        if use_https:
            conn_class = httplib.HTTPSConnection
        else:
            conn_class = httplib.HTTPConnection
        # timeout must be passed by keyword: the third positional
        # parameter of these constructors is not the timeout.
        conn = conn_class(host, port, timeout=timeout)
        data = ''
        try:
            conn.request(method, target, body, headers)
            resp = conn.getresponse()
            status = resp.status
            if 200 == status:
                data = resp.read()
            else:
                self.errcode = status
                self.errmsg = 'http response code(%s):%s' % (status, resp.reason)
        finally:
            # Release the socket even when the request raises.
            conn.close()
        return data

    def http_get(self, url, timeout=5, is_https=False):
        """Fetch ``url`` with GET.

        Args:
            url: Absolute URL to request.
            timeout: Socket timeout in seconds.
            is_https: Force HTTPS; an ``https://`` URL also selects it.

        Returns:
            The response body for a 200 response, otherwise ``''``
            (see ``errcode`` / ``errmsg``).
        """
        return self._request('GET', url, None, {}, timeout, is_https)

    def http_post(self, url, ps=None, timeout=5, is_https=False):
        """Send ``ps`` to ``url`` as a form-urlencoded POST.

        Args:
            url: Absolute URL to request.
            ps: Mapping (or sequence of pairs) of form fields; ``None``
                sends an empty form.
            timeout: Socket timeout in seconds.
            is_https: Force HTTPS; an ``https://`` URL also selects it.

        Returns:
            The response body for a 200 response, otherwise ``''``
            (see ``errcode`` / ``errmsg``).
        """
        headers = {
            "Content-type": "application/x-www-form-urlencoded",
            "Accept": "text/plain",
        }
        body = urllib.urlencode(ps or {})
        return self._request('POST', url, body, headers, timeout, is_https)

# Fetch one page, collect its absolute links and dump them to a file.
url = "http://www.suning.com"
parser = SuningParser()
page = urllib.urlopen(url)
try:
    parser.feed(page.read())
    # close() makes the parser process any input it is still buffering.
    parser.close()
finally:
    page.close()
print(parser.urlList)

# Binary mode: every entry already ends with "\r\n", and text mode on
# Windows would expand the "\n" again, producing "\r\r\n".
with open(r'E:\py\wb.txt', 'wb') as f:
    f.writelines(parser.urlList)

print("done")

目前只能爬取一层链接，对动态加载的页面内容也没有做处理。

qq_25803197

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
个人借鉴网上爬虫代码改写的py

#-*-coding:utf-8-*-import urllibimport httplib from sgmllib import SGMLParserfrom urlparse import urlparseclass SuningParser(SGMLParser): errcode='' errmsg='' urlList =
复制链接

扫一扫