Web Crawlers
Related Packages
builtwith 1.3.3
python-whois 0.7.0
httpie 0.9.9
beautifulsoup4 4.6.0
bs4 0.0.1
lxml 4.2.1
pymongo 3.6.1
PyMySQL 0.8.1
redis 2.10.6
requests 2.18.4
robobrowser 0.5.3
selenium 3.12.0
Pillow 5.1.0
pytesseract 0.2.2
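Most of these packages are demonstrated in the examples below; builtwith and python-whois only appear in this list, so here is a minimal sketch of using them to profile a target site before crawling (the URLs are just examples and the lookups need network access):

import builtwith
import whois

# Guess which servers/frameworks a site is built with.
print(builtwith.parse('http://www.bootcss.com/'))
# Look up domain registration (WHOIS) information.
print(whois.whois('baidu.com'))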
Common User-Agent Strings
1. Android
Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19
Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30
Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
2. Firefox
Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0
Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0
3. Google Chrome
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36
Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19
4. iOS
Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3
Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3
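These strings are sent in the request headers so a crawler looks like a regular browser; a minimal sketch with requests (the URL is only a placeholder):

import requests

# Pretend to be mobile Firefox by overriding the default User-Agent.
headers = {'User-Agent': 'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0'}
resp = requests.get('http://www.example.com/', headers=headers)
print(resp.status_code)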
Selecting Tags with BeautifulSoup
Key points: bs4 + re
from bs4 import BeautifulSoup
import re


def main():
    html = "……"
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title)
    print(soup.body.h1)
    print(soup.p)
    print(soup.body.p.text)
    print(soup.body.p.contents)
    for p_child in soup.body.p.children:
        print(p_child)
    print(len([elem for elem in soup.body.children]))
    print(len([elem for elem in soup.body.descendants]))
    print(soup.find_all(re.compile(r'^h[1-6]')))
    # a bare string is treated as a literal tag name, so wrap patterns in re.compile
    print(soup.body.find_all(re.compile(r'^h')))
    print(soup.body.div.find_all(re.compile(r'^h')))
    print(soup.find_all(re.compile(r'r$')))
    print(soup.find_all('img', {'src': re.compile(r'\./img/\w+.png')}))
    print(soup.find_all(lambda x: len(x.attrs) == 2))
    print(soup.find_all(foo))
    print(soup.find_all('p', {'class': 'foo'}))
    for elem in soup.select('a[href]'):
        print(elem.attrs['href'])


def foo(elem):
    return len(elem.attrs) == 2


if __name__ == '__main__':
    main()
A Simple Crawler Example
Key points: re + urllib + pymysql + ssl
from urllib.error import URLError
from urllib.request import urlopen
import re
import pymysql
import ssl
from pymysql import Error


def decode_page(page_bytes, charsets=('utf-8',)):
    """Try the candidate charsets one by one until the page decodes."""
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
    """Fetch a page, retrying a few times on URL errors."""
    page_html = None
    try:
        page_html = decode_page(urlopen(seed_url).read(), charsets)
    except URLError:
        if retry_times > 0:
            return get_page_html(seed_url, retry_times=retry_times - 1,
                                 charsets=charsets)
    return page_html


def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
    """Return all matches of the pattern in the page."""
    pattern_regex = re.compile(pattern_str, pattern_ignore_case)
    return pattern_regex.findall(page_html) if page_html else []


def start_crawl(seed_url, match_pattern, *, max_depth=-1):
    conn = pymysql.connect(host='localhost', port=3306,
                           database='crawler', user='root',
                           password='123456', charset='utf8')
    try:
        with conn.cursor() as cursor:
            url_list = [seed_url]
            # the dict below avoids revisiting pages and records crawl depth
            visited_url_list = {seed_url: 0}
            while url_list:
                current_url = url_list.pop(0)
                depth = visited_url_list[current_url]
                if depth != max_depth:
                    page_html = get_page_html(current_url,
                                              charsets=('utf-8', 'gbk', 'gb2312'))
                    links_list = get_matched_parts(page_html, match_pattern)
                    param_list = []
                    for link in links_list:
                        if link not in visited_url_list:
                            visited_url_list[link] = depth + 1
                            page_html = get_page_html(link,
                                                      charsets=('utf-8', 'gbk', 'gb2312'))
                            headings = get_matched_parts(page_html, r'<h1>(.*)<span')
                            if headings:
                                param_list.append((headings[0], link))
                    cursor.executemany('insert into tb_result values (default, %s, %s)',
                                       param_list)
                    conn.commit()
    except Error:
        pass
    finally:
        conn.close()


def main():
    ssl._create_default_https_context = ssl._create_unverified_context
    start_crawl('http://sports.sohu.com/nba_a.shtml',
                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']',
                max_depth=2)


if __name__ == '__main__':
    main()
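The script above assumes a crawler database with a tb_result table already exists. A hedged sketch of a compatible schema follows; the column names id/title/url are assumptions, only the column count and order have to match the insert statement above:

import pymysql

conn = pymysql.connect(host='localhost', port=3306, database='crawler',
                       user='root', password='123456', charset='utf8')
try:
    with conn.cursor() as cursor:
        # three columns so that "insert into tb_result values (default, %s, %s)" fits
        cursor.execute("""
            create table if not exists tb_result (
                id int auto_increment primary key,
                title varchar(255),
                url varchar(1024)
            ) default charset=utf8
        """)
    conn.commit()
finally:
    conn.close()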
Key points: BeautifulSoup + requests
from bs4 import BeautifulSoup
import requests
import re


def main():
    resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    html = resp.content.decode('gbk')
    bs = BeautifulSoup(html, 'lxml')
    for elem in bs.select('a[test]'):
        link_url = elem.attrs['href']
        resp = requests.get(link_url)
        bs_sub = BeautifulSoup(resp.text, 'lxml')
        print(re.sub(r'[\r\n]', '', bs_sub.find('h1').text))


if __name__ == '__main__':
    main()
from urllib.parse import urljoin
import re
import requests
from bs4 import BeautifulSoup


def main():
    headers = {'user-agent': 'Baiduspider'}
    proxies = {
        'http': 'http://122.114.31.177:808'
    }
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    resp = requests.get(seed_url,
                        headers=headers,
                        proxies=proxies)
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    link_set = set()
    for a_tag in soup.find_all('a', {'href': href_regex}):
        if 'href' in a_tag.attrs:
            href = a_tag.attrs['href']
            full_url = urljoin(base_url, href)
            link_set.add(full_url)
    print('Total %d question pages found.' % len(link_set))


if __name__ == '__main__':
    main()
Key points: re + urllib + md5 + pickle + zlib + redis + ssl
from urllib.error import URLError
from urllib.request import urlopen
import re
import redis
import ssl
import hashlib
import logging
import pickle
import zlib


def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
    page_html = None
    try:
        if seed_url.startswith('http://') or \
                seed_url.startswith('https://'):
            page_html = decode_page(urlopen(seed_url).read(), charsets)
    except URLError as err:
        logging.error('[URL] %s', err)
        if retry_times > 0:
            return get_page_html(seed_url, retry_times=retry_times - 1,
                                 charsets=charsets)
    return page_html


def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
    pattern_regex = re.compile(pattern_str, pattern_ignore_case)
    return pattern_regex.findall(page_html) if page_html else []


def start_crawl(seed_url, match_pattern, *, max_depth=-1):
    client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    charsets = ('utf-8', 'gbk', 'gb2312')
    logging.info('[Redis ping] %s', client.ping())
    url_list = [seed_url]
    visited_url_list = {seed_url: 0}
    while url_list:
        current_url = url_list.pop(0)
        depth = visited_url_list[current_url]
        if depth != max_depth:
            page_html = get_page_html(current_url, charsets=charsets)
            links_list = get_matched_parts(page_html, match_pattern)
            for link in links_list:
                if link not in visited_url_list:
                    visited_url_list[link] = depth + 1
                    page_html = get_page_html(link, charsets=charsets)
                    if page_html:
                        # cache the page in Redis keyed by the MD5 of its URL,
                        # stored as a zlib-compressed pickle
                        hasher = hashlib.md5()
                        hasher.update(link.encode('utf-8'))
                        zipped_page = zlib.compress(pickle.dumps(page_html))
                        client.set(hasher.hexdigest(), zipped_page)


def main():
    ssl._create_default_https_context = ssl._create_unverified_context
    start_crawl('http://sports.sohu.com/nba_a.shtml',
                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']',
                max_depth=2)


if __name__ == '__main__':
    main()
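Reading a cached page back out of Redis just reverses the pipeline above: hash the URL with MD5, fetch the value, then zlib.decompress and pickle.loads it. A minimal sketch; the connection parameters mirror the ones above and the URL is a placeholder for any page the crawler has stored:

import hashlib
import pickle
import zlib

import redis

client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
link = 'http://example.com/some-cached-page'  # placeholder: any URL the crawler stored
key = hashlib.md5(link.encode('utf-8')).hexdigest()
zipped_page = client.get(key)
if zipped_page:
    page_html = pickle.loads(zlib.decompress(zipped_page))
    print(page_html[:200])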
Key points: re + requests + urljoin + bs4 + sha1 + pickle + zlib + redis
from hashlib import sha1
from urllib.parse import urljoin
import pickle
import re
import requests
import zlib
from bs4 import BeautifulSoup
from redis import Redis


def main():
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    client = Redis(host='localhost', port=6379, password='111111')
    headers = {'user-agent': 'Baiduspider'}
    resp = requests.get(seed_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    hasher_proto = sha1()
    for a_tag in soup.find_all('a', {'href': href_regex}):
        href = a_tag.attrs['href']
        full_url = urljoin(base_url, href)
        hasher = hasher_proto.copy()
        hasher.update(full_url.encode('utf-8'))
        field_key = hasher.hexdigest()
        if not client.hexists('zhihu', field_key):
            html_page = requests.get(full_url, headers=headers).text
            zipped_page = zlib.compress(pickle.dumps(html_page))
            client.hset('zhihu', field_key, zipped_page)
    print('Total %d question pages found.' % client.hlen('zhihu'))


if __name__ == '__main__':
    main()
A Comprehensive Crawler Example
Key points: requests + bs4 + sha1 + redis + mongodb + threading + urlparse
import logging
import pickle
import zlib
from hashlib import sha1
import pymongo
from enum import Enum, unique
from random import random
from threading import Thread, current_thread, local
from time import sleep
from urllib.parse import urlparse
import redis
import requests
from bs4 import BeautifulSoup
from bson import Binary


class Constants(object):
    """Constants shared by all spiders."""
    user_agent = 'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0'
    proxies = {
        "http": "http://111.183.231.117:61234/",
    }


@unique  # enum members must have unique values
class SpiderStatus(Enum):
    """The two states a spider can be in."""
    IDLE = 0
    WORKING = 1
def any_thread_alive(spider_threads):
    """Check whether any spider thread is still in the WORKING state."""
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


def decode_html_page(page, charsets):
    """Decode a fetched page with the given candidate charsets."""
    page_html = None
    for charset in charsets:
        try:
            page_html = page.content.decode(charset)
            break
        except Exception as e:
            logging.error(e)
    return page_html


class Retry(object):
    """A class-based decorator that retries a call a configurable number of
    times, waiting a random interval between attempts."""

    def __init__(self, *, retry_times=3, wait_secs=5, errors=(Exception,)):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, func):
        """__call__ is what makes an instance usable as a decorator."""

        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return func(*args, **kwargs)
                except self.errors as e:
                    logging.error(e)
                    sleep(int(self.wait_secs) * (random() + 1))
            return None

        return wrapper
class Spider(object):
    """Spider behaviour: fetch, parse, extract and store."""

    def __init__(self):
        # spiders start out idle
        self.status = SpiderStatus.IDLE

    @Retry()  # class-based decorators are applied with parentheses
    def fetch(self, current_url, *, user_agent=None, proxies=None,
              charsets=('gb2312', 'utf-8', 'gbk')):
        """Fetch and decode a page."""
        thread_name = current_thread().name
        print(f'{thread_name}:{current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        page = requests.get(current_url, headers=headers, proxies=proxies)
        return decode_html_page(page, charsets) if page.status_code == 200 else None

    def parse(self, html_page, domain='www.geyanw.com'):
        """Parse the URLs in a page and push unvisited ones onto the task list."""
        if html_page:
            soup = BeautifulSoup(html_page, 'lxml')
            for a_tag in soup.select_one('div[id="p_left"]').select('a[href]'):
                parser = urlparse(a_tag.attrs['href'])
                scheme = parser.scheme or 'https'
                netloc = parser.netloc or domain
                if netloc == domain and scheme != 'javascript':
                    path = parser.path
                    query = '?' + parser.query if parser.query else ''
                    full_url = f'{scheme}://{netloc}{path}{query}'
                    redis_client = thread_local.redis_client
                    if not redis_client.sismember('visited_url', full_url):
                        redis_client.rpush('task_list', full_url)
                        print('full_url:' + full_url)

    def extract(self, html_page):
        """Extract the title and content of a page."""
        if html_page:
            soup = BeautifulSoup(html_page, 'lxml')
            title = content = ''
            try:
                title = soup.select_one('div[id="p_left"]') \
                    .select_one('div[class="title"]').find('h2').text
            except Exception:
                pass
            try:
                content_ps = soup.select_one('div[id="p_left"]') \
                    .select_one('div[class="content"]').find_all('p')
                for content_p in content_ps:
                    content += content_p.text
            except Exception:
                pass
            return title, content

    def store(self, my_dict):
        """Store a page in MongoDB, keyed by the SHA-1 of its content."""
        mongo_db = thread_local.mongo_db
        hasher = hash_proto.copy()
        hasher.update(my_dict['content'].encode('utf-8'))
        doc_id = hasher.hexdigest()
        mongo_data_coll = mongo_db[my_dict['current_path']]
        if not mongo_data_coll.find_one({'_id': doc_id}):
            mongo_data_coll.insert_one(
                dict(_id=doc_id, path=my_dict['current_path'],
                     url=my_dict['current_url'], title=my_dict['title'],
                     content=Binary(zlib.compress(pickle.dumps(my_dict['content'])))))
            print('Stored in MongoDB.')
class SpiderThread(Thread):
    """A worker thread that drives one Spider."""

    def __init__(self, name, spider):
        # daemon threads will not keep the interpreter alive on their own
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        """The thread's work loop lives in run()."""
        redis_client = redis.Redis(host='localhost', port=6379)
        mongo_client = pymongo.MongoClient(host='localhost', port=27017)
        thread_local.redis_client = redis_client
        thread_local.mongo_db = mongo_client.geyanwang
        while True:
            current_url = redis_client.lpop('task_list')
            while not current_url:
                self.spider.status = SpiderStatus.IDLE
                current_url = redis_client.lpop('task_list')
            if current_url:
                self.spider.status = SpiderStatus.WORKING
                current_url = current_url.decode('utf-8')
                if not redis_client.sismember('visited_url', current_url):
                    redis_client.sadd('visited_url', current_url)
                    html_page = self.spider.fetch(current_url,
                                                  user_agent=Constants.user_agent,
                                                  proxies=Constants.proxies)
                    if html_page:
                        title, content = self.spider.extract(html_page)
                        current_path = ''
                        try:
                            current_path = urlparse(current_url).path.split('/')[1]
                        except Exception:
                            pass
                        if current_path and title and content:
                            my_dict = dict(current_url=current_url,
                                           current_path=current_path,
                                           title=title, content=content)
                            self.spider.store(my_dict)
                        self.spider.parse(html_page)


thread_local = local()
hash_proto = sha1()
def main():
    redis_client = redis.Redis(host='localhost', port=6379)
    if not redis_client.exists('task_list'):
        redis_client.rpush('task_list', 'https://www.geyanw.com/')
    spider_threads = [SpiderThread('th-%d' % i, Spider()) for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    # busy-wait until the task list is empty and no spider is still working
    while redis_client.exists('task_list') or any_thread_alive(spider_threads):
        pass
    print('Over!')


if __name__ == '__main__':
    main()
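The documents written by store() keep the page content as a Binary blob of a zlib-compressed pickle, so reading one back looks roughly like this (the collection name is whichever first URL path segment the page had; 'lizhi' below is only a placeholder):

import pickle
import zlib

import pymongo

mongo_client = pymongo.MongoClient(host='localhost', port=27017)
coll = mongo_client.geyanwang['lizhi']  # placeholder collection name
doc = coll.find_one()
if doc:
    print(doc['title'], doc['url'])
    print(pickle.loads(zlib.decompress(doc['content'])))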
Form Operations
Key points: requests + bs4
import requests
from bs4 import BeautifulSoup


def main():
    resp = requests.get('https://github.com/login/')
    if resp.status_code != 200:
        return
    cookies = resp.cookies.get_dict()
    soup = BeautifulSoup(resp.text, 'lxml')
    utf8_value = soup.select_one('form input[name="utf8"]').attrs['value']
    authenticity_token_value = soup.select_one('form input[name="authenticity_token"]').attrs['value']
    data = {
        'utf8': utf8_value,
        'authenticity_token': authenticity_token_value,
        'login': '644148993@qq.com',
        'password': 'xxxxx'
    }
    resp = requests.post('https://github.com/session/', data=data, cookies=cookies)
    print(resp.text)


if __name__ == '__main__':
    main()
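A requests.Session keeps cookies across requests automatically, which is usually less error-prone than forwarding resp.cookies by hand. A sketch of the same login flow using a session; the credentials are placeholders and the form fields simply mirror the ones used above:

import requests
from bs4 import BeautifulSoup


def main():
    session = requests.Session()
    resp = session.get('https://github.com/login/')
    soup = BeautifulSoup(resp.text, 'lxml')
    data = {
        'utf8': soup.select_one('form input[name="utf8"]').attrs['value'],
        'authenticity_token': soup.select_one('form input[name="authenticity_token"]').attrs['value'],
        'login': 'your-email@example.com',
        'password': 'your password'
    }
    # the session re-sends the cookies it received from the GET request
    resp = session.post('https://github.com/session/', data=data)
    print(resp.status_code)


if __name__ == '__main__':
    main()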
Key points: robobrowser
import robobrowser


def main():
    b = robobrowser.RoboBrowser(parser='lxml')
    b.open('https://github.com/login/')
    f = b.get_form(action='/session')
    f['login'].value = '644148993@qq.com'
    f['password'].value = 'your password'
    b.submit_form(f)
    for a_tag in b.select('a[href]'):
        print(a_tag.attrs['href'])


if __name__ == '__main__':
    main()
Scraping Dynamic Content
Key points: Selenium (webdriver) + BeautifulSoup
from selenium import webdriver
from bs4 import BeautifulSoup


def main():
    driver = webdriver.Chrome()
    driver.get('https://v.taobao.com/v/content/live?catetype=704')
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for img_tag in soup.select('img[src]'):
        print(img_tag.attrs['src'])


if __name__ == '__main__':
    main()
Key points: Selenium (webdriver + Keys) + BeautifulSoup
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys


def main():
    driver = webdriver.Chrome()
    driver.get('https://v.taobao.com/v/content/live?catetype=704&from=taonvlang')
    elem = driver.find_element_by_css_selector('input[placeholder="输入关键词搜索"]')
    elem.send_keys('美女')
    elem.send_keys(Keys.ENTER)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for img_tag in soup.select('img[src]'):
        print(img_tag.attrs['src'])


if __name__ == '__main__':
    main()
Scrolling with Selenium
from time import sleep
from selenium import webdriver


def main():
    driver = webdriver.Chrome()
    driver.get('http://www.jd.com/')
    driver.execute_script('document.documentElement.scrollTop = 10000')
    while True:
        sleep(1)


if __name__ == '__main__':
    main()
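For pages that keep loading content as you scroll, a common refinement is to scroll step by step and stop once the page height no longer grows; a minimal sketch under that assumption:

from time import sleep

from selenium import webdriver


def main():
    driver = webdriver.Chrome()
    driver.get('http://www.jd.com/')
    last_height = driver.execute_script('return document.body.scrollHeight')
    while True:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        sleep(1)  # give lazily loaded content time to arrive
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break
        last_height = new_height
    driver.quit()


if __name__ == '__main__':
    main()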
Image CAPTCHA Recognition
Key points: PIL + pytesseract
from io import BytesIO
import requests
from PIL import Image
from pytesseract import image_to_string


def main():
    resp = requests.get('http://www.yundama.com/index/captcha?r=0.018109785648503074')
    img1 = Image.open(BytesIO(resp.content))
    img1.save('yanzhengma1.jpg')
    img2 = Image.open(open('yanzhengma1.jpg', 'rb'))
    img3 = img2.point(lambda x: 0 if x < 128 else 255)
    img3.save(open('yanzhengma100.jpg', 'wb'))
    print(image_to_string(img3))


if __name__ == '__main__':
    main()
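OCR results are usually better on a grayscale image, so a common variant converts the captcha to mode 'L' before thresholding; a small sketch under that assumption (pytesseract also needs the tesseract-ocr binary installed on the system, and the URL is the same sample endpoint as above):

from io import BytesIO

import requests
from PIL import Image
from pytesseract import image_to_string

resp = requests.get('http://www.yundama.com/index/captcha?r=0.018109785648503074')
img = Image.open(BytesIO(resp.content)).convert('L')   # grayscale first
img = img.point(lambda x: 255 if x > 128 else 0)       # then threshold to black/white
print(image_to_string(img))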