# Web crawler
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:Echean
# datetime:2018/8/8 21:22
# software: PyCharm
import json
import sys
import time
import requests
import re
import random
from scrapy import Selector
from mysql.mysqldb import MyPymysqlPool
class GetIP(object):
# Class-level list; starts empty (nothing in the visible part of the file fills it).
IPS = []
# Listing pages that _get_ip_list requests one after another.
url_list = [
    'http://www.xicidaili.com/nn/',
    'http://www.xicidaili.com/nn/2',
    'http://www.xicidaili.com/nn/3',
]
# Pool of User-Agent strings; _get_random_header picks one at random.
# Every entry must end with a comma: adjacent string literals without a
# separating comma are silently concatenated into one single string.
UA_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
]
def _get_random_header(self):
    """Build the HTTP headers for one request.

    The User-Agent value is drawn at random from ``self.UA_list``; the
    Upgrade-Insecure-Requests header is always set to '1'.

    Returns:
        dict: header name -> header value.
    """
    user_agent = random.choice(self.UA_list)
    return {
        'User-Agent': user_agent,
        'Upgrade-Insecure-Requests': '1',
    }
def _get_ip_list(self):
for url in self.url_list:
res = requests.get(url=url, headers=self._get_random_header())
sel = Selector(response=res)
ip_list = sel.css('#ip_list tr')