python IP代理爬虫，download 代理IP

最新推荐文章于 2024-05-02 01:13:24 发布

y912493785

最新推荐文章于 2024-05-02 01:13:24 发布

阅读量2.2k

点赞数

分类专栏： Python 应用文章标签： Python 爬虫代理IP 正则表达式 Python 爬虫 URL

本文链接：https://blog.csdn.net/y912493785/article/details/9952637

版权

Python 应用专栏收录该内容

1 篇文章 0 订阅

订阅专栏

本人小白，这是自己写来试试的，打算以后做一个能自动切换HTTP代理的Python程序。第一次写博客，请大家多多包涵。

#-*- coding: utf-8 -*-
import re 
import urllib2
import sys
import time
import os

def search_url_1(html, type):
    """Collect every link fragment of the form href="..." title="... in *html*.

    The matched fragments are glued together in document order with no
    separator between them.

    html -- page source to scan.
    type -- accepted for call compatibility; this function does not use it.

    Returns the concatenated fragments, or an empty string when the page
    contains no such fragment.
    """
    link_pattern = re.compile(r'''href="\S+" title="\S+''')
    fragments = [found.group() for found in link_pattern.finditer(html)]
    return ''.join(fragments)
def search_url_2(html_1, type):
    """Find the address of the proxy list page linked from *html_1*.

    The link fragments gathered by search_url_1() are searched for the first
    one whose title carries the marker text for the free overseas HTTP proxy
    list; the ``http://....html`` address inside that fragment is printed
    and returned.

    html_1 -- source of the index page.
    type   -- forwarded unchanged to search_url_1().

    Returns the matched page URL.
    Raises ValueError when no link with the marker title, or no ``.html``
    address inside it, can be found (previously this surfaced as an
    AttributeError on ``None``).
    """
    links = search_url_1(html_1, type)

    entry_pattern = re.compile(r'''href="\S+" title="\S+免费国外网页http代理ip地址''')
    entry = entry_pattern.search(links)
    if entry is None:
        raise ValueError("no link titled with the overseas proxy list marker was found")

    # The dot is escaped so that only a literal ".html" suffix is accepted.
    url_pattern = re.compile(r'''http://\S+\.html''')
    url_match = url_pattern.search(entry.group())
    if url_match is None:
        raise ValueError("the proxy list link does not contain an http://....html address")

    page_url = url_match.group()
    # Single-argument print(...) behaves the same as the print statement on Python 2.
    print(page_url)
    return page_url
def save_IP():
    """Leave an empty ip.txt behind: open it for writing and close it at once."""
    # The target is the current working directory with a backslash and
    # "ip.txt" appended, i.e. a Windows-style path.
    target = os.getcwd() + "\\ip.txt"
    with open(target, 'w'):
        pass
def _main(url):
    """Download the page at *url* and store the ip:port entries it lists.

    The page is fetched with a browser-like User-Agent header. Everything
    from the ``<div class="cont_font">`` tag onwards is scanned for
    dotted-quad addresses followed by a port, and each hit is written on its
    own line to the ip.txt file that save_IP() has just emptied.

    url -- address of the proxy list page.

    Raises ValueError when the page has no ``cont_font`` block (previously an
    AttributeError on ``None``). Errors from urllib2 and from file access
    propagate unchanged.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        html = response.read()   # presumably UTF-8, as the original note says
    finally:
        # Release the connection even if the read fails.
        response.close()

    # Python 2 may report None here on some systems; fall back to UTF-8 so
    # the status messages can still be encoded.
    fs_encoding = sys.getfilesystemencoding() or "UTF-8"
    print("正在下载代理IP，请稍后。。".decode("UTF-8").encode(fs_encoding))

    content = re.search(r'''<div class="cont_font">[\s\S]+''', html)
    if content is None:
        raise ValueError('no <div class="cont_font"> block found in the page at %s' % url)

    # Four dot-separated groups of 1-3 digits, then a port of 2-5 digits.
    # The earlier pattern used unescaped dots and optional digits, so it
    # also matched arbitrary text before a colon, and it cut 5-digit ports
    # down to 4 digits.
    proxy_rule = re.compile(r'''\d{1,3}(?:\.\d{1,3}){3}:\d{2,5}''')

    save_IP()
    # Same Windows-style path that save_IP() truncates; opened once instead
    # of once per match.
    path = os.getcwd() + "\\ip.txt"
    with open(path, 'a') as out:
        for found in proxy_rule.finditer(content.group()):
            out.write(found.group() + '\n')
    print("IP下载存储成功".decode("UTF-8").encode(fs_encoding))
	
	
########################################################
# Script entry: fetch the index page, locate the proxy list page it links
# to, then download and store the proxies listed there.
print(u'---------准备抓取目标网站--------')
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
req = urllib2.Request("http://www.youdaili.cn/", headers=headers)
response = urllib2.urlopen(req)
try:
    html = response.read()   # presumably UTF-8, as the original note says
finally:
    # Release the connection even if the read fails.
    response.close()
# Named fs_encoding rather than type so the builtin is not shadowed module-wide.
fs_encoding = sys.getfilesystemencoding()
print(u'正在获取公布最新代理ip网址：')
url = search_url_2(html, fs_encoding)
_main(url)

# Presumably meant to keep the console open until Enter is pressed.
# Python 2's input() evaluates whatever is typed (a bare Enter raises
# SyntaxError), and the old branch assigned to an undefined `self`, so
# raw_input() is used purely as a pause.
try:
    raw_input()
except EOFError:
    # No interactive stdin (e.g. piped or closed); nothing to wait for.
    pass
########################################################

虽然是小白，代码里也没有写注释，但还是希望转载时能注明出处，谢谢。

y912493785

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python IP代理爬虫，download 代理IP

本人小白，自己写来试试的，打算以后做个自动切换HTTP代理的Python程序那，第一次写博客，大家多包涵哈。#-*- coding: utf-8 -*-import re import urllib2import sysimport timeimport osdef search_url_1(html,type): ruel= re.compile(r'''href="\
复制链接

扫一扫

专栏目录