Setting up the Python 3.6.5 environment
git clone https://github.com/pyenv/pyenv.git ~/.pyenv
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
echo 'export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc
echo 'eval "$(pyenv init -)"' >> ~/.bashrc
source ~/.bashrc
pyenv install 3.6.5
pyenv global 3.6.5
pyenv rehash
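After reloading the shell, it is worth confirming that the pyenv shim is actually the interpreter on PATH before going further. A minimal check script (nothing here is project-specific; the expected version is simply the one installed above):

# check_python.py -- verify the pyenv-managed 3.6.5 is active
import sys

assert sys.version_info[:3] == (3, 6, 5), sys.version
print(sys.executable)  # should resolve to a path under ~/.pyenv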
Install the required libraries
pip install flask
pip install requests
pip install lxml
pip install celery
pip install pymysql
pip install flask_sqlalchemy
pip install supervisor
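With the packages in place, a quick sanity check that they import cleanly under the new interpreter can save debugging time later. A minimal sketch (it only imports the libraries installed above and prints their versions where exposed):

# check_deps.py -- confirm the installed packages import under 3.6.5
import flask, requests, lxml, celery, pymysql, flask_sqlalchemy

for mod in (flask, requests, lxml, celery, pymysql, flask_sqlalchemy):
    print(mod.__name__, getattr(mod, '__version__', 'unknown'))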
#!/usr/bin/env python3
# coding=utf-8
import re
import time

import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from lxml import etree


class GoogleSpider:

    def __init__(self, seoKeyworsList):
        self.seoKeyworsList = seoKeyworsList
        super(GoogleSpider, self).__init__()
    def go(self):
        """Run the spider: one worker thread per keyword."""
        tasks_master = []
        print(self.seoKeyworsList)
        # Size the thread pool dynamically to the number of keywords
        executor_master = ThreadPoolExecutor(max_workers=len(self.seoKeyworsList))
        for keyword in self.seoKeyworsList:
            task_master = executor_master.submit(self.get_google, keyword)
            tasks_master.append(task_master)
        # shutdown() blocks until every submitted task has finished
        executor_master.shutdown()
        data = []
        for ta in as_completed(tasks_master):
            data.append(ta.result())
        return data
    def parseGoogle(self, url):
        """Parse a Google results page and collect the result links."""
        response = self.do_Request_proxy(url)
        if response and response.status_code == requests.codes.ok:
            html = etree.HTML(response.content)
            links = html.xpath('//*[@id="rso"]/div/div/div//div/div/div[1]/a[1]/@href')
            return links
        return []
    def get_google(self, keyword):
        """Fetch three result pages for a keyword, then crawl each result link in depth."""
        google_links = [
            'https://www.google.com/search?&q=' + keyword,
            'https://www.google.co.jp/search?&q=' + keyword + '&start=10',
            'https://www.google.co.jp/search?&q=' + keyword + '&start=20'
        ]
        task_google_links = []
        # One worker per results page
        threadpool = ThreadPoolExecutor(max_workers=len(google_links))
        for link in google_links:
            task_google_links.append(threadpool.submit(self.parseGoogle, link))
        links = []
        for t in as_completed(task_google_links):
            if t.result():
                links += t.result()
        data = []
        if links:
            # Cap the depth crawl at 20 result pages per keyword
            links = links[:20]
            for link in links:
                page_source = self.get_other_page(link)
                if page_source is not None:
                    data.append(page_source)
        return data
    def get_other_page(self, link):
        """Extract title, meta description/keywords and outgoing links from a result page."""
        # Match the protocol and domain so relative links can be absolutized
        matches = re.search(r'^http(s)?://(.*?)/', link)
        if matches is not None and '.pdf' not in link:
            protocol = 'https://' if matches.group(1) is not None else 'http://'
            host = protocol + matches.group(2)
            response = self.do_Request(link)
            if response and response.status_code == requests.codes.ok:
                response.encoding = 'utf8'
                data = {}
                xml = etree.HTML(response.content)
                title = xml.xpath('//title/text()')
                if title:
                    data['site_url'] = link
                    data['title'] = title[0]
                    data['description'] = xml.xpath('//meta[@name="description"]/@content')
                    data['keywords'] = xml.xpath('//meta[@name="keywords"]/@content')
                    data['atags'] = []
                    for tag in xml.xpath('//*/a/@href'):
                        # Skip in-page anchors, javascript links, PDFs and dot-relative paths
                        if '#' in tag or 'javascript' in tag or '.pdf' in tag or './' in tag:
                            continue
                        # Absolutize root-relative links against the page's host
                        if re.search(r'^http(s)?://(.*?)/', tag) is None:
                            tag = host + tag
                        data['atags'].append(tag)
                    return data
        return None
    def do_Request_proxy(self, url):
        """Request through a proxy drawn from the IP pool (https://github.com/jhao104/proxy_pool)."""
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        retry_count = 10
        proxy = self.get_proxy()
        while retry_count > 0:
            try:
                return s.get(url, proxies={
                    "http": "http://{}".format(proxy),
                    "https": "http://{}".format(proxy)
                })
            except Exception:
                time.sleep(5)
                retry_count -= 1
        # The proxy kept failing, so drop it from the pool
        self.delete_proxy(proxy)
        return False
    def do_Request(self, url):
        """Direct request without a proxy."""
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        retry_count = 5
        while retry_count > 0:
            try:
                result = s.get(url, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36"})
                s.close()
                return result
            except Exception:
                retry_count -= 1
        return False
    def get_proxy(self):
        """Fetch a proxy ("ip:port") from the pool."""
        return requests.get("http://127.0.0.1:5010/get/").content.decode('utf-8')

    def delete_proxy(self, proxy):
        """Remove a dead proxy from the pool."""
        requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
# Invocation
if __name__ == '__main__':
    keywords = ['python', 'php', 'java']
    google = GoogleSpider(keywords)
    data = google.go()
    print(data)
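One caveat on the proxy pool: the code above assumes the older proxy_pool API, where /get/ returns a bare ip:port string. Recent releases of that project return a JSON payload instead, so if that is the version deployed, get_proxy would need a small adjustment. A hedged sketch, assuming a JSON schema with a "proxy" key:

# get_proxy variant for newer proxy_pool releases (assumed JSON schema)
import requests

def get_proxy_json():
    payload = requests.get("http://127.0.0.1:5010/get/").json()
    return payload.get("proxy", "")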