# query.py -- the persistence layer: talks to MySQL
import hashlib

from mysql import connector


class Quercy(object):
    def __init__(self, mysqlDB='crawler', mysqlUser='root'):
        self.db = connector.connect(database=mysqlDB, user=mysqlUser)
        self.cursor = self.db.cursor()

    def times(self, website):
        '''Return how many times this URL has been visited.'''
        md5 = self._md5_digest(website)
        query = '''SELECT times
                   FROM crawler.website
                   WHERE md5 = %s'''
        # Parameterized query: the driver escapes the value itself,
        # unlike the string interpolation it replaces.
        self.cursor.execute(query, (md5,))
        times = self.cursor.fetchone()
        if times is None:
            return 0
        return times[0]

    def restore(self, website, keywords):
        '''Store the page in the database.'''
        md5 = self._md5_digest(website)
        query = '''INSERT INTO website (md5, pri_website, keywords)
                   VALUES (%s, %s, %s)
                   ON DUPLICATE KEY UPDATE times = times + 1'''
        self.cursor.execute(query, (md5, website, keywords))
        self.db.commit()

    def _md5_digest(self, website):
        '''Convert the raw URL to its MD5 hex digest.'''
        md5 = hashlib.md5()
        md5.update(website.encode('utf-8'))
        return md5.hexdigest()

if __name__ == '__main__':
    q = Quercy()
    website = 'www.baidu.com'
    q.restore(website, 'baidu')
    print(q.times(website))
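For reference, the queries above imply the shape of the crawler.website table: an md5 key (hence the ON DUPLICATE KEY branch), the original URL, the keywords, and a visit counter. Below is a guessed schema, not taken from the source; the column types and the DEFAULT 1 on times are assumptions inferred from the fact that the INSERT never sets times explicitly yet times() expects a value.

# Guessed DDL for the table the Quercy class relies on; types and defaults
# are inferred from the queries above, not confirmed by the original code.
SCHEMA = '''
CREATE TABLE IF NOT EXISTS crawler.website (
    md5         CHAR(32)      NOT NULL PRIMARY KEY,  -- hex MD5 of the URL
    pri_website VARCHAR(2048) NOT NULL,              -- the original URL
    keywords    TEXT,                                 -- comma-separated keywords
    times       INT           NOT NULL DEFAULT 1      -- visit counter
)
'''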
# webparser.py -- extracts hyperlinks and keywords from a page
from html.parser import HTMLParser


class WebParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.tag = None
        self.href = []      # hyperlinks found on the page
        self.keywords = []  # keywords found on the page

    def handle_starttag(self, tag, attrs):
        # Only the meta, title and a tags are of interest.
        if tag in ('meta', 'title', 'a'):
            self.tag = tag
        else:
            self.tag = None
        # Collect hyperlinks and keywords from the attributes.
        for name, value in attrs:
            if name == 'href' and value and value.startswith('http'):
                self.href.append(value)
            if name == 'content' and value:
                self.keywords += value.split()

    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        if self.tag is None:
            return
        if self.tag == 'title':
            self.keywords.append(data)

if __name__ == '__main__':
    from urllib.request import urlopen

    parser = WebParser()
    data = urlopen('http://www.taobao.com/')
    print(data.info())
    d = data.read().decode('gbk')
    parser.feed(d)
    print(parser.href)
    print(parser.keywords)
# crawler module -- breadth-first traversal driven by a URL queue
import re
from collections import deque
from urllib.request import urlopen

from query import Quercy
from webparser import WebParser


class Crawler(object):
    def __init__(self):
        self.urllist = deque()  # BFS frontier: discovered but unvisited URLs

    def addURL(self, *urllist):
        for eachurl in urllist:
            self.urllist.append(eachurl)

    def visit(self, website):
        html = self._open(website)
        parser = WebParser()
        parser.feed(html)
        key = ','.join(parser.keywords)
        return parser.href, key

    def _open(self, url):
        data = urlopen(url)
        header = str(data.info())
        # Read the charset from the Content-Type header; fall back to
        # UTF-8 when the header does not name one.
        result = re.search(r'charset=([\w-]+)', header)
        charset = result.group(1) if result is not None else 'utf-8'
        return data.read().decode(charset, errors='replace')

    def _pop_url(self):
        if len(self.urllist) == 0:
            raise SystemExit(0)
        return self.urllist.popleft()

    def _put_url(self, urllist):
        for url in urllist:
            self.urllist.append(url)

    def run(self):
        q = Quercy()  # one connection for the whole run, not one per URL
        while True:
            url = self._pop_url()
            times = q.times(url)
            if times == 0:
                urllist, key = self.visit(url)
                self._put_url(urllist)
                q.restore(url, key)

c = Crawler()
c.addURL('http://www.taobao.com/')
c.run()
Just three classes: Quercy talks to MySQL, WebParser parses the pages, and Crawler does the breadth-first traversal, rambling around Taobao and storing each fetched page and its keywords in the database. Because the traversal is breadth-first, URLs that have been discovered but not yet fetched stay in memory, and once the frontier grows to a certain size the whole thing simply freezes up.
As for efficiency... still looking into it...
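One likely driver of that growth is that nothing stops the same URL from being enqueued many times before its first visit. Below is a minimal sketch of a deduplicating, capped frontier; the DedupFrontier name and the maxsize cap are illustrative choices of mine, not part of the code above.

# A minimal sketch, not the original design: deduplicate URLs before they
# enter the frontier so each link is queued at most once, and cap the queue
# so it cannot grow without bound.
from collections import deque

class DedupFrontier(object):
    def __init__(self, maxsize=100000):  # maxsize is an assumed tunable
        self.queue = deque()
        self.seen = set()                # every URL ever enqueued
        self.maxsize = maxsize

    def put(self, url):
        # Skip already-queued URLs and stop accepting new ones at the cap.
        if url not in self.seen and len(self.queue) < self.maxsize:
            self.seen.add(url)
            self.queue.append(url)

    def pop(self):
        # Return the oldest URL, or None when the frontier is empty.
        return self.queue.popleft() if self.queue else None

Swapping this in for the bare deque (put inside _put_url and addURL, pop inside _pop_url) would keep memory bounded; if even the seen set grows too large, a disk-backed queue would be the next step.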