54.使用python做一个简单的网络爬虫

最新推荐文章于 2024-11-13 17:25:30 发布

weixin_33701617

最新推荐文章于 2024-11-13 17:25:30 发布

阅读量140

点赞数

文章标签：爬虫数据库 python

原文链接：https://my.oschina.net/quanpower/blog/152015

版权

2019独角兽企业重金招聘Python工程师标准>>>

概述：

这是一个简单的爬虫，作用也很简单：给定一个网址，抓取这个网址的页面，然后从中提取满足要求的url地址，把这些地址放入队列中；当给定的网页抓取完毕后，就把队列中的网址作为参数，程序再次去抓取这些页面的数据，直到达到一定的深度（由参数指定）后停止。程序将抓取的网页数据保存在本地，我使用的是mysql数据库。下面正式开始。

建立数据库：

启动mysql ，建立一个database

create database sp character set utf8;

然后建立一个表，这个表包含三个字段，一个保存url，一个保存原始的html代码，还有一个保存去掉html标签后的数据。之所以还要第三个是为了让以后做搜索的时候效率能高一点。

use sp;

create table webdata (url longtext, html longtext, puredata longtext);

数据库准备好以后，就开始写代码了。

python程序：

程序我就不做过多说明了，程序中关键部分有注释。程序的参数说明一下：

-u 要抓取的网址

-d 抓取深度，顺着链接爬多少层页面。页面每多一层，数量几何倍增长。默认为2

-t 并发线程数。默认为10线程

-o timeout值。urlopen的timeout阈值。默认为20秒

-l 指定日志文件的路径和名字，默认为当前路径，名为logSpider.log

-v 指定日志的记录详细程度，有三个可选值，默认为normal

simple 只记录错误信息

normal 除了错误信息，还记录一些程序运行过程中的状态信息

all 所有的信息，以及爬过的url网址都记录在内

关于timeout要说明一下：各系统timeout的默认值

BSD	75 seconds
Linux	189 seconds
Solaris	225 seconds
Windows XP	21 seconds

对mysql配置文件做修改：

我在做实验的时候发现，如果抓取深度为2，那么程序可以顺利的运行。但把深度调为3的时候，就会出现2006 - MySQL server has gone away错误，我按照网上的方法，修改了mysql配置文件后，就解决了这个问题。方法是将配置文件中 max_allowed_packet = 1M 修改为 max_allowed_packet = 16M

我以此参数运行程序：

python spider.py -u http://www.chinaunix.net -d 3 -t 15 -o 10

程序运行了26分钟，成功抓取了4346个页面，产生了35KB的日志。程序平均每秒能抓取2.8个页面。日志中最多的记录就是某某网址无法打开。

好了，下面上代码：

#-*-coding:utf-8-*-

import httplib
import logging
import socket
import sys
import threading
import urllib2
from datetime import datetime
from optparse import OptionParser
from re import search
from urlparse import urljoin, urlparse

import MySQLdb
from BeautifulSoup import BeautifulSoup

# Crawl frontier shared by all threads: depth level -> list of URLs found
# for that level.
URLS = dict()

# Guards the database writes and the updates of URLS done by worker threads.
lock = threading.Lock()

class newThread(threading.Thread):
    """Worker thread that crawls one batch of URLs.

    level -- key of the module-level ``URLS`` dict that receives the links
             discovered by this worker.
    url   -- iterable of URLs this worker has to fetch.
    db    -- object with a ``save(url, html, pureData)`` method used to
             store each fetched page.
    """

    def __init__(self, level, url, db):
        threading.Thread.__init__(self)
        self.level = level
        self.url = url
        self.db = db

    def run(self):
        """Fetch every URL of the batch, store the page and queue its links."""
        for target in self.url:
            log.debug('%s:%s' % (datetime.now(), target))
            print(target)
            links, html, data = getURL(target)
            # getURL returns ('', '', '') when the page could not be fetched
            # (open failed, timed out, status was not 200): drop such URLs.
            # Test the HTML rather than the link list so that a page which
            # simply contains no links is still stored.
            if not html:
                continue
            # `with` releases the lock even if save() raises; otherwise every
            # other worker would block on the lock forever.
            with lock:
                self.db.save(target, html, data)
                # All workers append to the shared URLS entry; the main
                # thread removes the duplicates afterwards.
                URLS[self.level].extend(links)

class saveData(object):
    """Small wrapper around the MySQL connection that stores crawled pages.

    Creating an instance connects to the database and deletes every row of
    the ``webdata`` table, so each run starts from an empty table.

    The keyword arguments are passed to ``MySQLdb.connect``; their defaults
    are the values that used to be hard-coded.
    """

    def __init__(self, user='root', db='sp', unix_socket='/tmp/mysql.sock'):
        self.db = MySQLdb.connect(user=user, db=db, unix_socket=unix_socket)
        self.cur = self.db.cursor()
        self.cur.execute('delete from webdata')
        self.commit()
        log.info('%s:Connect database success' % datetime.now())

    def save(self, url, html, pureData):
        """Insert one page into ``webdata`` and commit.

        A ProgrammingError/OperationalError from the driver is logged and
        the row is skipped (no commit); other exceptions propagate.
        """
        try:
            # Let the driver quote the values: building the statement with
            # string interpolation breaks on any quote character in the page
            # and allows SQL injection from crawled content.
            self.cur.execute('insert into webdata values(%s,%s,%s)',
                             (url, html, pureData))
        except (MySQLdb.ProgrammingError, MySQLdb.OperationalError) as e:
            log.error('%s:%s' % (datetime.now(), e))
            return
        self.commit()

    def commit(self):
        """Commit the current transaction."""
        self.db.commit()

    def close(self):
        """Close the database connection."""
        self.db.close()

def _extractLinks(html, pageURL, domain):
    """Return the links of *html* that stay inside *domain*.

    Only anchors written as ``<a href="...">`` and closed by ``</a>`` on the
    same line are recognised. Links are returned as absolute URLs in
    document order, without duplicates and without *pageURL* itself.
    """
    links = []
    seen = set()
    for line in html.split('\n'):
        # Every '</a>' ends one anchor, so each piece but the last holds at
        # most one complete anchor; the text after the final '</a>' has none.
        for segment in line.split('</a>')[:-1]:
            match = search('<a href="([^"]*)"', segment)
            if match is None:
                continue
            # A fragment never changes which document is fetched.
            link = match.group(1).split('#', 1)[0]
            if not link:
                continue
            # Relative links are relative to the page they appear on.
            link = urljoin(pageURL, link)
            # Drops javascript:, mailto:, ftp: and similar targets.
            if not link.startswith(('http://', 'https://')):
                continue
            if domain not in link or link in seen:
                continue
            if link == pageURL or link == pageURL + '/':
                continue
            seen.add(link)
            links.append(link)
    return links


def getURL(url):
    """Download *url* and return ``(links, html, pureData)``.

    links    -- list of in-domain absolute URLs found in the page.
    html     -- page source re-encoded from GBK to UTF-8, with every single
                quote replaced by a double quote.
    pureData -- text of the page with the HTML tags removed.

    Returns ``('', '', '')`` when the page cannot be opened, the status code
    is not 200 or reading the body fails; each case is logged.

    Uses the module-level globals ``log`` (logger) and ``domainName``
    (substring a link must contain to be followed).
    """
    try:
        page = urllib2.urlopen(url)
    except (urllib2.URLError, httplib.BadStatusLine):
        log.error('%s:URL CAN NOT OPEN----%s' % (datetime.now(), url))
        return ('', '', '')

    try:
        if page.code != 200:
            log.error('%s:RESPONSE CODE IS NOT 200----%s' % (datetime.now(), url))
            return ('', '', '')
        try:
            # Pages are assumed to be GBK encoded; undecodable bytes are dropped.
            html = page.read().decode('gbk', 'ignore').encode('utf-8')
        except Exception:
            # Best effort: any failure while reading the body is reported as
            # a timeout and the page is skipped.
            log.error('%s:TIME OUT----%s' % (datetime.now(), url))
            print('TIME OUT')
            return ('', '', '')
    finally:
        page.close()

    # Normalising the quotes means only the double-quoted form of href has
    # to be recognised when extracting links.
    html = html.replace("'", '"')

    # Text content of the page with the HTML elements stripped.
    try:
        pureData = ''.join(BeautifulSoup(html).findAll(text=True)).encode('utf-8')
    except UnicodeEncodeError:
        pureData = html

    return (_extractLinks(html, url, domainName), html, pureData)

if __name__ == '__main__':
    # Command-line entry point: parse the options, crawl level by level with
    # a pool of worker threads and store every fetched page in the database.
    USAGE = '''
spider -u [url] -d [num] -t [num] -o [secs] -l [filename] -v [level]

    -u: url of a websit
    -d: the deeps of the spider will get into.default is 2
    -t: how many threads work at the same time.default is 10
    -o: url request timeout.default is 20 secs
    -l: assign the logfile name and location.default name is 'logSpider.log'
    -v: values are 'simple' 'normal' 'all'.default is 'normal'
        'simple'---- only log the error message
        'normal'---- error message and some addtion message
        'all' ---- not only message ,but also urls will be logged.

Examples:
    spider -u http://www.chinaunix.net -t 16 -v normal
'''

    LEVELS = {
        'simple': logging.WARNING,
        'normal': logging.INFO,
        'all': logging.DEBUG,
    }

    opt = OptionParser(USAGE)
    opt.add_option('-u', type='string', dest='url')
    opt.add_option('-d', type='int', dest='level', default=2)
    opt.add_option('-t', type='int', dest='nums', default=10)
    opt.add_option('-o', type='int', dest='out', default=20)
    opt.add_option('-l', type='string', dest='name', default='logSpider.log')
    opt.add_option('-v', type='string', dest='logType', default='normal')
    options, args = opt.parse_args()

    source = options.url
    level = options.level
    threadNums = options.nums
    timeout = options.out
    logfile = options.name
    logType = options.logType

    if (not source or level < 0 or threadNums < 1 or timeout < 1
            or logType not in LEVELS):
        # print_help() writes the text itself and returns None, so it must
        # not be wrapped in print.
        opt.print_help()
        sys.exit(1)

    # Accept https URLs as given; only scheme-less input gets a default scheme.
    if not source.startswith(('http://', 'https://')):
        source = 'http://' + source
    if source.endswith('/'):
        source = source[:-1]

    # domainName is the label a link must contain to count as "same site":
    # the label before the top-level domain, or the one before that when the
    # host ends in a two-part suffix such as ".com.cn".
    hostParts = urlparse(source)[1].split(':')[0].split('.')
    if len(hostParts) < 2:
        # Dot-less host such as "localhost".
        domainName = hostParts[0]
    else:
        domainName = hostParts[-2]
        if (domainName in ['com', 'edu', 'net', 'org', 'gov', 'info', 'cn']
                and len(hostParts) >= 3):
            domainName = hostParts[-3]

    socket.setdefaulttimeout(timeout)

    log = logging.getLogger()
    handler = logging.FileHandler(logfile)
    log.addHandler(handler)
    log.setLevel(LEVELS[logType])

    startTime = datetime.now()
    log.info('Started at %s' % startTime)

    for i in range(level + 1):
        URLS[i] = []

    # Connect to the database (this also empties the webdata table).
    db = saveData()
    try:
        # Links found on the start page form the first frontier.
        URLS[0], html, pureData = getURL(source)
        if not URLS[0]:
            log.error('cannot open %s' % source)
            print('cannot open ' + source)
            sys.exit(1)
        db.save(source, html, pureData)

        # Every URL that has ever been queued, so no page is crawled twice.
        queued = set(URLS[0])
        queued.add(source)
        queued.add(source + '/')

        for le in range(level):
            nowL = '-------------level %d------------' % (le + 1)
            print(nowL)
            log.info(nowL)

            # Deal the frontier out round-robin so the work is spread evenly
            # even when there are fewer URLs than threads, and URLS[le]
            # itself is left intact.
            frontier = URLS[le]
            threads = []
            for i in range(threadNums):
                batch = frontier[i::threadNums]
                if not batch:
                    continue
                t = newThread(le + 1, batch, db)
                t.daemon = True
                threads.append(t)
            for t in threads:
                t.start()
            # Wait for the whole level to finish.
            for t in threads:
                t.join()

            # Keep only the URLs that were never queued before.
            nowLevel = le + 1
            fresh = []
            for link in URLS[nowLevel]:
                if link not in queued:
                    queued.add(link)
                    fresh.append(link)
            URLS[nowLevel] = fresh
    finally:
        db.close()

    endTime = datetime.now()
    log.info('Ended at %s' % endTime)
    log.info('Takes %s' % (endTime - startTime))

搜索

有了本地存储的数据后，就可以对其中的数据进行搜索。其实搜索引擎是如何根据关键字来检索互联网的，这个我并不清楚，我做这个仅仅是一个演示。如果还记得我前面说的数据库表中的三个字段的话，那这段程序就不用我解释了。程序将输入的词在puredata中检索，若检索到，就输出对应的url。

import MySQLdb

# Interactive search over the crawled pages: load the url and tag-free text
# of every stored page once, then print the url of each page whose text
# contains the keyword typed by the user.
db = MySQLdb.connect(user='root', db='sp', unix_socket='/tmp/mysql.sock')
try:
    cur = db.cursor()
    # The raw html column is not needed for searching, so it is not fetched.
    nums = cur.execute('select url, puredata from webdata')
    print('%d items' % nums)
    rows = cur.fetchall()

    print('input something to search,"exit" to exit')
    while True:
        try:
            key = raw_input('>')
        except EOFError:
            # End of input (Ctrl-D) ends the session like "exit".
            break
        if key == 'exit':
            break
        for url, text in rows:
            if key in text:
                print(url)
        print('search finished')
finally:
    db.close()

最后给大家上一张搜索结果的截图：

转载于:https://my.oschina.net/quanpower/blog/152015

weixin_33701617

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫