判断代理是否可用的核心在于 requests 的 proxies 与 timeout 两个参数。
from pyquery import PyQuery as pq
import requests
# Request headers used for both scraping and probing.
# NOTE: the header name must be 'User-Agent' (hyphen); the original
# 'User_Agent' is not recognized by servers, so no UA was being sent.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/79.0.3945.117 Safari/537.36',
           'Cookie': 'channelid=0; sid=1579167120733110; _ga=GA1.2.617590803.1579168440; '
                     '_gid=GA1.2.1767454781.1579168440; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1579168440; '
                     'Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1579168440'}
# Module-level working dict: get_proxies() overwrites it per table row,
# is_available() reads it and stores a .copy() into one of the result lists.
proxy = {}
availables = []      # proxies that answered the probe request in time
not_availables = []  # proxies that raised a requests.RequestException
def get_proxies(url):
    """Scrape one listing page of kuaidaili.com and test every proxy on it.

    For each table row, the global ``proxy`` dict is overwritten with the
    row's address (same value for the 'http' and 'https' schemes) and then
    ``is_available()`` is called, which records a copy of the dict in either
    ``availables`` or ``not_availables``.

    :param url: listing-page URL, e.g. 'https://www.kuaidaili.com/free/inha/1/'
    """
    try:
        # Timeout + exception guard: one flaky page must not abort the
        # whole 100-page crawl (the original call could hang or raise).
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return
    if resp.status_code != 200:
        return
    doc = pq(resp.text)
    for item in doc('table tbody tr').items():
        # Build the address once instead of formatting it twice.
        scheme = item.find('td[data-title=类型]').text().lower()
        ip = item.find('td[data-title=IP]').text()
        port = item.find('td[data-title=PORT]').text()
        address = '{}://{}:{}'.format(scheme, ip, port)
        proxy['http'] = address
        proxy['https'] = address
        is_available()
def is_available(test_url='https://www.kuaidaili.com/free/inha/1/', timeout=3):
    """Probe ``test_url`` through the global ``proxy`` and record the outcome.

    A copy of ``proxy`` is appended to ``availables`` when the request
    completes within ``timeout`` seconds, otherwise to ``not_availables``.

    :param test_url: URL fetched through the proxy (default: the site's own
        first listing page, as in the original hard-coded version).
    :param timeout: per-request timeout in seconds (default 3).
    """
    try:
        requests.get(url=test_url, headers=headers, proxies=proxy, timeout=timeout)
    except requests.RequestException:
        not_availables.append(proxy.copy())
    else:
        # Success path in `else` keeps the try body minimal: an append
        # failure would otherwise be misreported as a dead proxy.
        availables.append(proxy.copy())
if __name__ == '__main__':
    pages = 100
    for page in range(1, pages + 1):
        print('正在爬取第{}页'.format(page))
        url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
        get_proxies(url)
    # Explicit encoding: the default is platform-dependent (e.g. cp936 on
    # Chinese Windows), which makes the output files non-portable.
    with open('可用代理.txt', 'w', encoding='utf-8') as f:
        f.writelines(p['http'] + '\n' for p in availables)
    with open('不可用代理.txt', 'w', encoding='utf-8') as f:
        f.writelines(p['http'] + '\n' for p in not_availables)