- 首先通过pip install builtwith安装builtwith
C:\Users\Administrator>pip install builtwith
Collecting builtwith
Downloading builtwith-1.3.2.tar.gz
Installing collected packages: builtwith
Running setup.py install for builtwith ... done
Successfully installed builtwith-1.3.2
- 在pycharm中新建工程并输入下面测试代码
import builtwith
tech_used = builtwith.parse('http://www.baidu.com')
print(tech_used)
运行会得到下面的错误:
C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe F:/python/first/FirstPy
Traceback (most recent call last):
File "F:/python/first/FirstPy", line 1, in <module>
import builtwith
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\builtwith\__init__.py", line 43
except Exception, e:
^
SyntaxError: invalid syntax
Process finished with exit code 1
原因是builtwith是基于2.x版本的,需要修改几个地方,在pycharm出错信息中双击出错文件,进行修改,主要修改下面三种:
1. Python2中的 “Exception ,e”的写法已经不支持,需要修改为“Exception as e”。
2. Python2中print后的表达式在Python3中都需要用括号括起来(只要加括号不用修改其他的东西)。
3. builtwith中使用的是Python2中的urllib2工具包,这个工具包在Python3中是不存在的,需要修改urllib2相关的代码。
1和2容易修改,下面主要针对第3点进行修改:
首先将import urllib2替换为下面的代码:
import urllib.request
import urllib.error
再次运行项目,遇到下面错误:
C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe F:/python/first/FirstPy
Traceback (most recent call last):
File "F:/python/first/FirstPy", line 3, in <module>
builtwith.parse('http://www.baidu.com')
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\builtwith\__init__.py", line 62, in builtwith
if contains(html, snippet):
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\builtwith\__init__.py", line 105, in contains
return re.compile(regex.split('\\;')[0], flags=re.IGNORECASE).search(v)
TypeError: cannot use a string pattern on a bytes-like object
Process finished with exit code 1
这是因为在Python3中,urllib的response.read()返回的是bytes而不是str,而匹配用的正则表达式是字符串,所以需要先把网页内容解码成字符串。将下面的代码:
if html is None:
html = response.read()
修改为:
if html is None:
html = response.read()
html = html.decode('utf-8')
但是如果把网站换成 ‘www.163.com’,运行再次报错如下:
C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe F:/python/first/FirstPy
Error: 'utf-8' codec can't decode byte 0xcd in position 500: invalid continuation byte
Traceback (most recent call last):
File "F:/python/first/FirstPy", line 2, in <module>
tech_used = builtwith.parse('http://www.163.com')
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\builtwith\__init__.py", line 63, in builtwith
if contains(html, snippet):
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\builtwith\__init__.py", line 106, in contains
return re.compile(regex.split('\\;')[0], flags=re.IGNORECASE).search(v)
TypeError: cannot use a string pattern on a bytes-like object
Process finished with exit code 1
这说明该网站的页面并不是utf-8编码(中文网站常见的是GBK)。那么不同的网站需要用不同的解码方式吗?下面介绍一种判别网站编码格式的方法。
我们需要安装一个叫chardet的工具包,如下:
C:\Users\Administrator>pip install chardet
Collecting chardet
Downloading chardet-2.3.0-py2.py3-none-any.whl (180kB)
100% |████████████████████████████████| 184kB 616kB/s
Installing collected packages: chardet
Successfully installed chardet-2.3.0
C:\Users\Administrator>
将builtwith对应的代码做下面修改,记得 import chardet!!!!:
encode_type = chardet.detect(html)
if encode_type['encoding'] == 'utf-8':
html = html.decode('utf-8')
else:
html = html.decode('gbk')
然后遇到了最大的问题:最后这段代码不管怎么写,运行时都提示缩进错误。经过1个小时的各种尝试,终于发现了问题所在:写完if xxxx:之后直接敲回车换行,编辑器自动生成的缩进与文件中原有的缩进不一致(很可能是Tab和空格混用),于是出现缩进错误。需要手动删掉前面自动生成的缩进,再手打空格对齐才可以。
最后给懒人附上修改完成的__init__.py:
import sys
import os
import re
import json
import urllib.request
import urllib.error
import chardet
def builtwith(url, headers=None, html=None, user_agent='builtwith'):
    """Detect the technology used to build a website.

    Fingerprints from the module-level ``data`` mapping are matched in four
    passes: against the URL, the response headers, the page HTML/script
    snippets, and the page's ``<meta name=... content=...>`` tags.

    Args:
        url: Address of the site to inspect.
        headers: Optional mapping of response headers. When ``None`` they are
            fetched from ``url``.
        html: Optional page source as ``str`` or ``bytes``. When ``None`` it
            is downloaded from ``url``.
        user_agent: Value sent as the ``User-Agent`` request header.

    Returns:
        A dict mapping each category name to the list of app names detected
        in that category.

    Download failures are printed and otherwise ignored, so detection carries
    on with whatever headers/HTML are available.

    The examples below show the shape of the result. Their ``u''`` prefixes
    are Python 2 style reprs, and the values depend on the live sites.

    >>> builtwith('http://wordpress.com')
    {u'blogs': [u'PHP', u'WordPress'], u'font-scripts': [u'Google Font API'], u'web-servers': [u'Nginx'], u'javascript-frameworks': [u'Modernizr'], u'programming-languages': [u'PHP'], u'cms': [u'WordPress']}
    >>> builtwith('http://webscraping.com')
    {u'javascript-frameworks': [u'jQuery', u'Modernizr'], u'web-frameworks': [u'Twitter Bootstrap'], u'web-servers': [u'Nginx']}
    >>> builtwith('http://microsoft.com')
    {u'javascript-frameworks': [u'jQuery'], u'mobile-frameworks': [u'jQuery Mobile'], u'operating-systems': [u'Windows Server'], u'web-servers': [u'IIS']}
    >>> builtwith('http://jquery.com')
    {u'cdn': [u'CloudFlare'], u'web-servers': [u'Nginx'], u'javascript-frameworks': [u'jQuery', u'Modernizr'], u'programming-languages': [u'PHP'], u'cms': [u'WordPress'], u'blogs': [u'PHP', u'WordPress']}
    >>> builtwith('http://joomla.org')
    {u'font-scripts': [u'Google Font API'], u'miscellaneous': [u'Gravatar'], u'web-servers': [u'LiteSpeed'], u'javascript-frameworks': [u'jQuery'], u'programming-languages': [u'PHP'], u'web-frameworks': [u'Twitter Bootstrap'], u'cms': [u'Joomla'], u'video-players': [u'YouTube']}
    """
    techs = {}

    # check URL
    for app_name, app_spec in data['apps'].items():
        if 'url' in app_spec:
            if contains(url, app_spec['url']):
                add_app(techs, app_name, app_spec)

    # download content
    if None in (headers, html):
        try:
            request = urllib.request.Request(url, None, {'User-Agent': user_agent})
            if html:
                # already have HTML so just need to make HEAD request for headers
                request.get_method = lambda: 'HEAD'
            # the context manager closes the connection even if reading fails
            with urllib.request.urlopen(request) as response:
                if headers is None:
                    headers = response.headers
                if html is None:
                    html = response.read()
        except Exception as e:
            # best effort: report the problem and continue with what we have
            print('Error:', e)

    # Every fingerprint is a str regex, so bytes (what urllib returns under
    # Python 3, or what a caller may pass in) must be decoded first.
    if isinstance(html, bytes):
        html = _decode_html(html)

    # check headers
    if headers:
        for app_name, app_spec in data['apps'].items():
            if 'headers' in app_spec:
                if contains_dict(headers, app_spec['headers']):
                    add_app(techs, app_name, app_spec)

    # check html
    if html:
        for app_name, app_spec in data['apps'].items():
            for key in 'html', 'script':
                snippets = app_spec.get(key, [])
                if not isinstance(snippets, list):
                    snippets = [snippets]
                for snippet in snippets:
                    if contains(html, snippet):
                        add_app(techs, app_name, app_spec)
                        break

        # check meta (kept inside ``if html`` so a failed download is skipped)
        # XXX add proper meta data parsing
        metas = dict(re.compile('<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>', re.IGNORECASE).findall(html))
        for app_name, app_spec in data['apps'].items():
            for name, content in app_spec.get('meta', {}).items():
                if name in metas:
                    if contains(metas[name], content):
                        add_app(techs, app_name, app_spec)
                        break

    return techs


def _decode_html(raw):
    """Decode downloaded page bytes to text using chardet's best guess."""
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    # Pages labelled GB2312/GBK often contain characters outside that set;
    # GB18030 is the superset codec that covers them.
    if encoding.lower() in ('gb2312', 'gbk'):
        encoding = 'gb18030'
    try:
        # errors='replace' guarantees a str result, so one stray byte cannot
        # leave the page as bytes and break the regex matching later on.
        return raw.decode(encoding, errors='replace')
    except LookupError:
        # chardet named a codec this Python build does not provide
        return raw.decode('utf-8', errors='replace')
# Alias so the detector can also be called as ``builtwith.parse(url)``.
parse = builtwith
def add_app(techs, app_name, app_spec):
    """Record an app, and every app it implies, under its categories.

    Args:
        techs: Dict mapping category name to a list of app names. It is
            updated in place.
        app_name: Name of the detected app.
        app_spec: Spec dict of the app. Its categories come from
            ``get_categories`` and its optional ``implies`` entry (one name
            or a list of names) is followed transitively.

    Raises:
        KeyError: If an implied name is missing from ``data['apps']``.
    """
    visited = set()

    def visit(name, spec):
        # Apps that imply each other would otherwise recurse without end.
        if name in visited:
            return
        visited.add(name)

        for category in get_categories(spec):
            members = techs.setdefault(category, [])
            if name not in members:
                members.append(name)

        implies = spec.get('implies', [])
        if not isinstance(implies, list):
            implies = [implies]
        for implied_name in implies:
            visit(implied_name, data['apps'][implied_name])

    visit(app_name, app_spec)
def get_categories(app_spec):
    """Translate the category ids in ``app_spec['cats']`` into their names.

    Each id is converted to ``str`` and looked up in ``data['categories']``.
    The names are returned as a list, in the same order as the ids.
    """
    names = []
    for category_id in app_spec['cats']:
        names.append(data['categories'][str(category_id)])
    return names
def contains(v, regex):
    """Search ``v`` case-insensitively for ``regex``.

    Anything in ``regex`` from the first ``\\;`` marker onwards is meta data
    and is dropped before matching. Returns the match object, or ``None``
    when nothing matches.
    """
    pattern, _marker, _meta = regex.partition('\\;')
    return re.search(pattern, v, flags=re.IGNORECASE)
def contains_dict(d1, d2):
    """Check that every pattern in ``d2`` matches the same-key value in ``d1``.

    Returns False as soon as a key of ``d2`` has no truthy value in ``d1`` or
    that value does not satisfy ``contains``. Returns True otherwise, which
    includes the case where ``d2`` is empty.
    """
    for key, pattern in d2.items():
        value = d1.get(key)
        if not value or not contains(value, pattern):
            return False
    return True
def load_apps(filename='apps.json.py'):
    """Load apps from Wappalyzer JSON (https://github.com/ElbertF/Wappalyzer)

    Args:
        filename: Name of the JSON file. A relative name is resolved against
            the directory of this module; an absolute path is used as given.

    Returns:
        The parsed JSON document.
    """
    # get the path of this filename relative to the current script
    # XXX add support to download update
    filename = os.path.join(os.getcwd(), os.path.dirname(__file__), filename)
    # Read as UTF-8 explicitly: the platform default (e.g. cp936 on Chinese
    # Windows) can fail on non-ASCII JSON. ``with`` also closes the file.
    with open(filename, encoding='utf-8') as apps_file:
        return json.load(apps_file)
# Fingerprint database, loaded once at import time; builtwith(), add_app()
# and get_categories() read it as a module-level global.
data = load_apps()
if __name__ == '__main__':
    # Command-line use: analyse every URL given as an argument and print one
    # "category: apps" line per detected category, sorted by category name.
    targets = sys.argv[1:]
    if not targets:
        print('Usage: %s url1 [url2 url3 ...]' % sys.argv[0])
    for target in targets:
        detected = builtwith(target)
        for category, apps in sorted(detected.items()):
            print('%s: %s' % (category, apps))