Implementing a web crawler in Python
I recently took part in a short crash-course Python competition run by my school, so I worked through the language from start to finish. I used to think Java was the greatest, but Python clearly has its strengths: development is fast, the syntax is concise, and the way it handles lists and arrays really does make small programs quicker to write than in Java ^^
Below I document the crawler I wrote in Python to collect Baidu Baike entries related to "python" (tech stack: Python, MySQL).
1. Crawler architecture and how it works
The crawler has three core components, the URL manager, the page downloader, and the page parser, plus a scheduler and a data outputer.
URL manager: keeps track of all URLs, hands a URL to the page downloader when asked, and marks that URL as already crawled.
Page downloader: downloads the page behind a URL to the local machine.
Page parser: extracts the data we care about from the page (in this example, the entry information) and pulls out any further qualifying URLs, storing them in MySQL so the URL manager can hand them out later.
Scheduler: similar to the main method in Java, this is the entry point of the crawler. It seeds the first URL and then, inside a while loop, calls the URL manager, page downloader, and page parser in turn (see the sketch right after this list).
Data outputer: writes out the collected data.
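Distilled down to its control flow, the whole thing is just the loop below. This is only a rough outline of my own; the real scheduler later in this post adds error handling and a 1000-page cap, but the module and class names are the same ones used in the code that follows.
import url_manager, html_donloader, html_parse, html_outputer

urls = url_manager.UrlManager()            # URL manager, backed by MySQL
donloader = html_donloader.HtmlDonload()   # page downloader
parser = html_parse.HtmlParse()            # page parser
outputer = html_outputer.HtmlOutputer()    # data outputer

urls.init_url("http://baike.baidu.com/item/Python")   # the scheduler seeds the entry URL
while urls.has_new_url():                  # keep going while uncrawled URLs remain
    url = urls.get_new_url()               # take one uncrawled URL
    html = donloader.donload(url)          # fetch the page
    new_urls, data = parser.parse(url, html)  # extract data and further links
    urls.add_new_urls(new_urls)            # feed new links back to the URL manager
    outputer.collect_data(url, data)       # persist what we extracted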
2. Code structure
1. The MySQL side
Table baike_spider, account: root, password: 0203
CREATE TABLE `baike_spider` (
`webSite` varchar(255) DEFAULT NULL,
`isCraw` int(1) DEFAULT '0',
`title` varchar(255) DEFAULT NULL,
`cont` text,
KEY `webSide` (`webSite`) USING HASH
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
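Before wiring up the crawler, it is handy to sanity-check that the table and credentials work. The snippet below is just a quick check of my own, assuming the same localhost/root/0203 login and a database named bigData, which is what the connect() calls in the code below use:
import MySQLdb as mdb

# quick connectivity check: same credentials and database as the crawler code
db = mdb.connect("localhost", "root", "0203", "bigData", charset="utf8")
cursor = db.cursor()
cursor.execute("show tables like 'baike_spider'")
print cursor.fetchall()   # should show baike_spider once the table exists
cursor.close()
db.close()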
2. Source code layout: the project is split into the scheduler, URL manager, downloader, parser, and outputer modules shown below.
3. Crawl results:
Slightly embarrassing: only two or three of the results shown actually contain "python"...
But never mind, the rest must be in the part that isn't shown ^_^
- 3. The scheduler
# coding=utf-8
# import baike_sider
import url_manager, html_donloader, html_parse, html_outputer
import sys

# Python 2 trick to make utf-8 the default string encoding
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

class SpiderMain(object):
    def __init__(self):
        # initialize the four components: URL manager, downloader, parser, outputer
        self.urls = url_manager.UrlManager()
        self.donloader = html_donloader.HtmlDonload()
        self.parse = html_parse.HtmlParse()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        # seed the URL manager with the entry URL
        self.urls.init_url(root_url)
        # count keeps track of how many pages the crawler has processed
        count = 0
        # keep looping as long as the URL manager still has uncrawled URLs
        while self.urls.has_new_url():
            try:
                # take a new URL from the URL manager
                new_url = self.urls.get_new_url()
                # download that page
                cont = self.donloader.donload(new_url)
                # parse the page: collect further URLs and the entry data
                urls, new_data = self.parse.parse(new_url, cont)
                # hand the newly found URLs back to the URL manager
                self.urls.add_new_urls(urls)
                # let the outputer store the extracted data
                if new_data is not None:
                    self.outputer.collect_data(new_url, new_data)
                print "crawled %s, found %d new URLs, data: %s, %s" % (new_url, len(urls), new_data["title"], new_data["summary"])
                print "crawling page number %d" % count
                # stop once 1000 pages have been crawled
                if count == 1000:
                    break
                count += 1
            except Exception, value:
                print "craw error :", value
        # return the outputer
        # return self.outputer
        print "craw finished"

if __name__ == "__main__":
    root_url = "http://baike.baidu.com/item/Python"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
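A small variation I sometimes use (not part of the original code): take the seed URL from the command line so the script can be pointed at a different entry without editing the source.
if __name__ == "__main__":
    # hypothetical variant of the entry point: use the first command-line argument as the
    # seed URL, falling back to the Python entry when none is given
    root_url = sys.argv[1] if len(sys.argv) > 1 else "http://baike.baidu.com/item/Python"
    SpiderMain().craw(root_url)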
- 4. The URL manager
# coding=utf-8
import MySQLdb as mdb
class UrlManager(object):
    def __init__(self):
        # connect to MySQL; the table is dropped and rebuilt on every run, so each crawl starts fresh
        self.db = mdb.connect("localhost", "root", "0203", "bigData", charset="utf8")
        cursor = self.db.cursor()
        delete_sql = '''drop table if exists baike_spider'''
        create_sql = '''create table if not exists baike_spider(
            webSite varchar(255),
            isCraw int(1) default '0',
            title varchar(255),
            cont text,
            KEY `webSide` (`webSite`) USING HASH
        )'''
        try:
            cursor.execute(delete_sql)
            cursor.execute(create_sql)
            cursor.execute("SET NAMES UTF8")
            self.db.commit()
        except Exception, value:
            print "URLManager.__init__ Error: ", value
            self.db.rollback()
        finally:
            cursor.close()

    def init_url(self, root_url):
        # seed the table with the entry URL; isCraw defaults to 0, meaning "not crawled yet"
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            insert_sql = '''insert into baike_spider(webSite) values('%s')''' % root_url
            cursor.execute(insert_sql)
            self.db.commit()
        except Exception, value:
            self.db.rollback()
            print "URLManager.init_url: ", value
        finally:
            cursor.close()

    def has_new_url(self):
        # returns the number of uncrawled rows found (0 or 1), so it is truthy when work remains
        new = 0
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            select_sql = '''select isCraw from baike_spider where isCraw=0 limit 1'''
            new = cursor.execute(select_sql)
        except Exception, value:
            print "URLManager.has_new_url: ", value
        finally:
            cursor.close()
        return new

    def get_new_url(self):
        # fetch one uncrawled URL and immediately mark it as crawled
        url = ""
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            select_sql = '''select * from baike_spider where isCraw=0 limit 1'''
            cursor.execute(select_sql)
            url = cursor.fetchone()[0]
            update_sql = '''update baike_spider set isCraw=1 where webSite='%s' '''
            cursor.execute(update_sql % url)
            self.db.commit()
        except Exception, value:
            self.db.rollback()
            print "URLManager.get_new_url: ", value
        finally:
            cursor.close()
        return url

    def add_new_urls(self, urls):
        # insert only the URLs that are not already in the table
        is_exist = '''select isCraw from baike_spider where webSite='%s' '''
        insert_sql = '''insert into baike_spider(webSite) values('%s')'''
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            for url in urls:
                flag = cursor.execute(is_exist % url)
                if flag:
                    continue
                cursor.execute(insert_sql % url)
            self.db.commit()
        except Exception, value:
            print "URLManager.add_new_urls: ", value
            self.db.rollback()
        finally:
            cursor.close()
# urlManage = UrlManager()
# urlManage.has_new_url()
# urls = ["http://www.baidu.com","http://www.baidu.com4","http://www.baidu.com2","http://www.baidu.com1","http://www.baidu.com3"]
# print urlManage.add_new_urls(urls)
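One caveat with splicing the URL straight into the SQL string like above: if a URL ever contains a quote character, the statement breaks and that link is silently dropped. A variation of add_new_urls using MySQLdb's parameter binding would avoid that; this is my own sketch, not part of the original code, written as a standalone function that takes the db connection:
def add_new_urls_safe(db, urls):
    # same logic as UrlManager.add_new_urls, but the values are passed separately
    # so MySQLdb handles the escaping (note the placeholders carry no quotes)
    is_exist = '''select isCraw from baike_spider where webSite=%s'''
    insert_sql = '''insert into baike_spider(webSite) values(%s)'''
    try:
        cursor = db.cursor()
        for url in urls:
            if cursor.execute(is_exist, (url,)):
                continue  # already known, skip it
            cursor.execute(insert_sql, (url,))
        db.commit()
    except Exception, value:
        print "add_new_urls_safe: ", value
        db.rollback()
    finally:
        cursor.close()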
- 5. The page downloader
# coding=utf-8
import urllib2

class HtmlDonload():
    def __init__(self):
        pass

    def donload(self, url):
        # fetch the page and return its HTML; return an empty string on any failure
        cont = ""
        try:
            response = urllib2.urlopen(url)
            if response.getcode() == 200:
                # read the page content
                cont = response.read()
                # print the content for debugging
                # print cont
        except Exception, value:
            print "HtmlDonload(),Error", value
        return cont
# HtmlDonload().donload("http://www.baidu.com")
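urllib2's default User-Agent can get a different or empty page from some sites, and urlopen will happily hang on a slow server. If the downloader keeps returning empty content, a variant like the one below may help; it is my own sketch rather than part of the original code:
import urllib2

def donload_with_headers(url):
    # send a browser-like User-Agent and give up after 10 seconds
    req = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        response = urllib2.urlopen(req, timeout=10)
        if response.getcode() == 200:
            return response.read()
    except Exception, value:
        print "donload_with_headers error:", value
    return ""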
- 6. The page parser
# coding=utf-8
import re
import urlparse
from bs4 import BeautifulSoup

class HtmlParse():
    def __init__(self):
        pass

    def _get_new_urls(self, page_url, soup):
        # collect every link that points at another Baike entry (href contains /item/)
        new_urls = set()
        links = soup.find_all(name='a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # turn the relative href into an absolute URL
            new_full_url = urlparse.urljoin(page_url, new_url)
            # print "new_full_url = ", new_full_url
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, url, soup):
        # extract the entry title and its summary paragraph
        res_data = {}
        title_node = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data["title"] = title_node.get_text()
        summary_node = soup.find("div", class_="lemma-summary")
        res_data["summary"] = summary_node.get_text()
        # print "res_data = ", res_data
        return res_data

    def parse(self, url, cont):
        if cont is None or url is None:
            return
        # build a BeautifulSoup object from the downloaded HTML
        soup = BeautifulSoup(cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(url, soup)
        new_data = self._get_new_data(url, soup)
        return new_urls, new_data
# parse = HtmlParse()
# parse.parse("baidu.html",open("hello.html"))
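To see what the two selectors actually pick out without hitting the real site, you can feed parse() a tiny hand-written page. The HTML below is something I made up to match the CSS classes the parser looks for (a dd.lemmaWgt-lemmaTitle-title > h1 title and a div.lemma-summary block); it is not a real Baike page, and the snippet assumes it runs in the same module as HtmlParse:
# offline check of HtmlParse against a made-up page that mimics the expected structure
sample_html = '''
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
  <div class="lemma-summary">Python is a programming language.</div>
  <a href="/item/Guido">Guido</a>
</body></html>
'''
parser = HtmlParse()
urls, data = parser.parse("http://baike.baidu.com/item/Python", sample_html)
print urls                        # set(['http://baike.baidu.com/item/Guido'])
print data["title"], data["summary"]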
- 7. The data outputer
# coding=utf-8
import MySQLdb as mdb

class HtmlOutputer():
    def __init__(self):
        self.db = mdb.connect("localhost", "root", "0203", "bigData", charset="utf8")

    def collect_data(self, url, new_data):
        # write the parsed title and summary back onto the row for this URL
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            update_sql = '''update baike_spider set title=%s, cont=%s where webSite=%s'''
            # parameterized execute, so quotes inside the summary cannot break the SQL
            cursor.execute(update_sql, (new_data["title"], new_data["summary"], url))
            self.db.commit()
        except Exception, value:
            self.db.rollback()
            print "HtmlOutputer.collect_data: ", value
        finally:
            cursor.close()

    def print_data(self):
        # dump the title and summary of every crawled row to the console
        try:
            cursor = self.db.cursor()
            cursor.execute("SET NAMES UTF8")
            select_sql = '''select * from baike_spider where isCraw=1 '''
            cursor.execute(select_sql)
            results = cursor.fetchall()
            for result in results:
                print result[2], result[3]
        except Exception, value:
            self.db.rollback()
            print "HtmlOutputer.print_data: ", value
        finally:
            cursor.close()
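Finally, if you want the results dumped to the console as soon as the crawl finishes, print_data can be called on the scheduler's outputer. A minimal sketch of my own, assuming it sits at the bottom of the scheduler module where SpiderMain is defined:
if __name__ == "__main__":
    # crawl first, then print every collected title and summary
    spider = SpiderMain()
    spider.craw("http://baike.baidu.com/item/Python")
    spider.outputer.print_data()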