python - 爬虫递归抓取网站信息 url、title、desc
实现思路:分两部分实现,1》抓取网站所有的 URL ;2》通过 URL 就可以方便的拉取任何内容;
下面给出抓取所有 URL 的思路和 code,
其实,实现比较简单,只需要一个递归就搞定了,注意一下抓取的深度和回归条件,毕竟每个页面的 url 会有很多重复的;
#!/usr/bin/env python3
# coding=utf-8
import codecs
import os
import random
import re
import sys
import time,datetime
import json
import urllib
from urllib import parse
from urllib.parse import urlparse, urljoin
import requests
import hashlib
import threading
import sqlite3
from lxml import etree
from pip._vendor.urllib3 import request
from pyquery import PyQuery as pq
# from functools import reduce
# from pybloom_live.pybloom import (BloomFilter,
# ScalableBloomFilter,
# make_hashfuncs)
'''
Crawl every URL up to n levels below the home page.
'''
_debug = False
# _debug = True
_debugCount = 10
_maxPullDepth = 3 # maximum crawl (recursion) depth
_maxSpiderCount = 5000 # maximum number of pages to fetch in one run
_spiderTotal = 0  # running count of fetched pages
_pullSleepSecond = 1 # delay between requests, in seconds (politeness)
_baseDir = "/Users/site/seo-spider/" # base output directory
_siteDir = "com_xxxx_www" # per-site subdirectory under _baseDir
_protocol = 'https'
_urlBase = 'https://www.xxxx.com' # base URL of the target site
_siteDomain = "xxxx.com" # site domain, used to filter out off-site links
_logFileFullName = ''  # set in main() once the timestamped log path is known
_spiderUrls = set()  # URLs that were actually fetched
_urlAll = set()  # every on-site URL discovered (fetched or not)
# Pool of User-Agent strings; getHeader() picks one at random per request.
_headers = [
"Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19",
"Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
"Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
"Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
"Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
"Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
"Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" ,
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
def getHeader():
    """Build request headers with a randomly selected User-Agent string."""
    return {'User-Agent': random.choice(_headers)}
class UrlInfo:
    """Lightweight holder for a hyperlink's anchor text and URL.

    The original version also declared redundant class-level ``text``/``url``
    attributes that were always shadowed by the instance attributes set in
    ``__init__``; they have been removed.
    """
    def __init__(self, text='', url=''):
        self.text = text  # anchor text of the <a> element
        self.url = url    # absolute URL of the link

    def __repr__(self):
        return 'UrlInfo(text=%r, url=%r)' % (self.text, self.url)
def getHtml(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Prepends the configured protocol when the scheme is missing, records the
    URL in the global ``_spiderUrls`` set on a successful response, and
    returns ``None`` on an error response or any request/decoding exception.
    """
    html = None
    try:
        if not url.startswith('http'):
            url = _protocol + "://" + url
        # Bound the request so one dead host cannot hang the whole crawl.
        resp = requests.get(url, headers=getHeader(), timeout=15)
        # requests.Response is falsy for 4xx/5xx status codes.
        if not resp:
            print("无响应 code=%s url=%s" % (resp.status_code, url))
            return None
        _spiderUrls.add(url)
        # resp.text can mis-detect the charset for Chinese pages; decode the
        # raw bytes as UTF-8 explicitly instead.
        html = resp.content
        if isinstance(html, bytes):
            html = html.decode(encoding="utf-8")
        print('resp.len = ', len(html))
    except Exception as ex:
        # Fixed format string: the original 'url=% request err:%s' parsed the
        # "% r" as a space-flagged %r conversion and garbled the message.
        print('url=%s request err:%s' % (url, str(ex)))
    return html
def urlClear(url):
    """Normalize a URL: drop the query/fragment and any trailing '/' or whitespace.

    Fixes over the original:
    - '#.+|\\?.+' required at least one character after '#'/'?', so a bare
      trailing '#' or '?' survived; '[#?].*' removes it too.
    - '[/\\s]$' stripped only ONE trailing character; '[/\\s]+$' strips them all.
    """
    url = re.sub(r'[#?].*', '', url)   # remove fragment and query, even when empty
    url = re.sub(r'[/\s]+$', '', url)  # remove every trailing slash / whitespace char
    return url
def getUrls(baseUrl, html):
    """Parse *html* and return the set of new, on-site, normalized URLs.

    Relative hrefs are resolved against *baseUrl*. Off-site links are only
    counted for the summary log line. URLs already present in the global
    ``_urlAll`` set are skipped; every returned URL is added to ``_urlAll``.
    """
    urls = set()
    doc = pq(html)
    outAddr = []  # off-site links, kept only for the log line below
    for a in doc('a'):
        href = doc(a).attr('href')
        if not href:
            continue
        url = parse.urljoin(baseUrl, href)  # resolve relative links
        # Proper host comparison. The original substring test
        # (_siteDomain not in url) also matched look-alike hosts such as
        # "xxxx.com.evil.com" and URLs merely containing the domain in the
        # path or query string.
        host = urlparse(url).netloc.split(':')[0].lower()
        if host != _siteDomain and not host.endswith('.' + _siteDomain):
            outAddr.append(url)
            # print('外站地址 %s\tbase=%s'%(url,baseUrl))
            continue
        url = urlClear(url)
        # Crude sanity check that the URL still contains a host-like part.
        if not re.findall(r'\w\.[a-z0-9]+\.\w+', url):
            print('无效地址', url)
            continue
        if url in _urlAll:  # already discovered earlier — don't revisit
            continue
        _urlAll.add(url)
        urls.add(url)
    print('url=%s\t出站连接 %s 条' % (baseUrl, len(outAddr)))
    return urls
def spider(url, depth=0):
    """Recursively crawl *url*, collecting links up to ``_maxPullDepth`` levels.

    Recursion stops when a page yields no HTML, yields no new links, the
    depth limit would be exceeded, or ``_maxSpiderCount`` pages have been
    fetched in total.
    """
    global _spiderTotal
    _spiderTotal += 1
    url = urlClear(url)
    log = 'begin\tdepth=%s\turl=%s' % (depth, url)
    print(log)
    _urlAll.add(url)
    html = getHtml(url)
    if not html:
        print('html is null')
        return
    time.sleep(_pullSleepSecond)  # politeness delay between requests
    urls = getUrls(url, html)
    log = 'complet\tdepth=%s\turl=%s\tsubUrls=%s' % (depth, url, len(urls))
    print(log)
    saveLog(log)
    # Fixed: original tested "urls and len(urls) == 0", which is always
    # False, so the empty-result message could never be printed.
    if not urls:
        print('回归,urls.size=0')
        return
    n = 0
    for addr in urls:
        n += 1
        d = depth + 1
        if d > _maxPullDepth:
            # Fixed: log the depth that actually exceeded the limit (d),
            # not the current depth.
            print('回归,深度 %s > %s' % (d, _maxPullDepth))
            return
        if _spiderTotal > _maxSpiderCount:
            print('回归,最大抓取量 %s > %s' % (_spiderTotal, _maxSpiderCount))
            return
        print('start %s/%s\tbase=%s\turl=%s' % (n, len(urls), url, addr))
        spider(addr, d)
def saveLog(msg):
    """Append *msg* as one CRLF-terminated line to the global log file.

    Requires ``_logFileFullName`` to be set (done in ``main``).
    """
    # 'utf-8': the original 'utf=8' only worked by accident, via Python's
    # codec-name normalization (non-alphanumerics collapse to '_').
    with open(_logFileFullName, mode='a+', encoding='utf-8') as f:
        f.write('%s\r\n' % msg)
def saveUrls():
    """Write the crawl results to a timestamped text file.

    The file lists the fetched URLs (``_spiderUrls``) and every discovered
    URL (``_urlAll``), each section sorted, under ``_baseDir/_siteDir``.
    """
    now = time.time()
    ts = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now))
    today = time.strftime('%Y%m%d_%H%M', time.localtime(now))
    fileFullName = os.path.join(_baseDir, _siteDir, "urls_" + today + ".txt")
    # 'utf-8': the original 'utf=8' only worked via codec-name normalization.
    with open(fileFullName, mode='a+', encoding='utf-8') as f:
        f.write('时间:%s\r\n总数:%s\r\n' % (ts, len(_urlAll)))
        f.write('请求:%s\r\n' % len(_spiderUrls))
        f.write('%s\r\n' % ('~' * 30))
        f.write('#已请求\r\n')
        for row in sorted(_spiderUrls):
            f.write('%s\r\n' % row)
        f.write('%s\r\n' % ('~' * 30))
        f.write('#所有地址\r\n')
        for row in sorted(_urlAll):
            f.write('%s\r\n' % row)
    print('saveTree to %s' % fileFullName)
# Decorator that logs how long the wrapped function takes to execute.
def time_watcher(fn):
    """Wrap *fn* so each call's wall-clock cost is printed and logged.

    Fixed: the original wrapper discarded *fn*'s return value and returned
    the log string instead; a decorator must pass the result through.
    """
    import functools

    @functools.wraps(fn)  # preserve the wrapped function's name/docstring
    def _wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = fn(*args, **kwargs)  # invoke the wrapped function
        cost = time.perf_counter() - start
        log = "耗时:fun=%s\tcost=%.4fs" % (fn.__name__, cost)
        saveLog(log)
        print(log)
        return result
    return _wrapper
@time_watcher
def main():
    """Entry point: crawl the site recursively, then dump the results to disk."""
    global _logFileFullName
    stamp = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))
    _logFileFullName = os.path.join(_baseDir, _siteDir, "log_" + stamp + ".txt")
    startUrl = 'https://www.xxxx.com/' # xxxx
    spider(startUrl)  # recursive crawl
    print('抓取结束\tallUrls=%s\tspiderUrls=%s' % (len(_urlAll), len(_spiderUrls)))
    saveUrls()  # persist collected URLs to a file
def test():
    # Ad-hoc manual experiments (not a unit test): prints how urljoin
    # resolves various relative hrefs against a base URL, probes the
    # domain containment check, and fetches one page over the network.
    base = 'http://www.xxxx.com/user/khanacademy'
    print(parse.urljoin(base,'a.html'))
    print(parse.urljoin(base,'./a.html'))
    print(parse.urljoin(base,'../a.html'))
    print(parse.urljoin(base,'/a.html'))
    # NOTE(review): operands look reversed — this asks whether the literal
    # contains _siteDomain, not whether a URL belongs to the domain;
    # presumably `_siteDomain in url` was intended. Confirm intent.
    print('xxxx.com' in _siteDomain)
    print('xxx.com' in _siteDomain)
    html = getHtml('https://ds.xxxx.com/vip/jp')  # network fetch; result unused
    pass
if __name__ == '__main__':
    # Separator banner so successive runs are easy to tell apart in the console.
    banner = '~' * 60
    print('\r\n%s\r\n' % banner)
    # test()
    main()
'''
>>> f = open('test.txt', 'w',encoding='utf-8') # 若是'wb'就表示写二进制文件
>>> f.write('Hello, world!')
>>> f.close()
python文件对象提供了两个“写”方法: write() 和 writelines()。
write()方法和read()、readline()方法对应,是将字符串写入到文件中。
writelines()方法和readlines()方法对应,也是针对列表的操作。它接收一个字符串列表作为参数,将它们写入到文件中,换行符不会自动加入,因此,需要显式地加入换行符。
关于open()的mode参数:
'r':读
'w':写
'a':追加
'r+' == r+w(可读可写,文件若不存在就报错(IOError))
'w+' == w+r(可读可写,文件若不存在就创建)
'a+' == a+r(可追加可读,文件若不存在就创建)
对应的,如果是二进制文件,就都加一个b就好啦:
'rb' 'wb' 'ab' 'rb+' 'wb+' 'ab+'
'''