python代理池好难啊_python3代理池的搭建(一)

from pyquery import PyQuery as pq

import requests

# Default HTTP request headers: a desktop Chrome User-Agent string that
# get_page() sends with every request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}

def get_page(url, timeout=10):
    """Download *url* using the module-level ``headers`` and return its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled host.

    Returns:
        The response body as text, or ``None`` if the request failed
        (connection error, timeout, malformed URL, ...). The crawlers in
        this file guard with ``if html:``, so a failed page is skipped.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure instead of aborting the whole crawl.
        print('Failed to fetch', url, exc)
        return None
    return response.text

class Crawler(object, metaclass=ProxyMetaclass):
    """Generator-based scrapers for several free proxy listing sites.

    Every ``crawl_*`` method downloads listing pages with ``get_page``,
    parses them with pyquery and yields one proxy string per table entry.
    ``get_proxies`` runs one of those methods by name and collects the
    results into a list.
    """

    def get_proxies(self, callback):
        """Run the crawl method named *callback* and return its proxies.

        Args:
            callback: Name of a method on this instance that takes no
                required arguments and returns an iterable of proxies,
                e.g. ``'crawl_daili66'``.

        Returns:
            A list with every proxy produced by the method.

        Raises:
            AttributeError: If no attribute named *callback* exists.
        """
        proxies = []
        # Look the method up by name with getattr rather than eval(), so a
        # crafted callback string can never execute arbitrary code.
        for proxy in getattr(self, callback)():
            print('成功获取到代理', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_daili66(self, page_count=3):
        """Yield ``'ip:port'`` strings scraped from www.66ip.cn.

        Args:
            page_count: Number of the first listing page to crawl; the
                three following pages are crawled as well.
        """
        start_url = 'http://www.66ip.cn/{}.html'
        # NOTE(review): despite its name, page_count is used as the START
        # page and the window is fixed at 4 pages (default: pages 3-6).
        # crawl_kuaidaili uses range(1, page_count + 1) instead; confirm
        # which behaviour is intended before changing it.
        urls = [
            start_url.format(page)
            for page in range(page_count, page_count + 4)
        ]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if not html:
                continue
            doc = pq(html)
            # :gt(0) selects rows with index greater than 0, i.e. it skips
            # the first (header) row of the table.
            for tr in doc('.containerbox table tr:gt(0)').items():
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                # Skip rows lacking either cell so a bare ':' is never
                # emitted as a proxy.
                if ip and port:
                    yield ':'.join([ip, port])

    def crawl_goubanjia(self):
        """Yield proxy strings scraped from the www.goubanjia.com front page.

        Each result is the text of a ``td.ip`` cell after its ``<p>``
        children have been removed and newlines stripped.
        """
        start_url = 'http://www.goubanjia.com/'
        print('Crawling', start_url)
        html = get_page(start_url)
        if not html:
            return
        doc = pq(html)
        for td in doc('td.ip').items():
            # Drop the <p> children before reading the cell text
            # (presumably decoy elements injected by the site; verify).
            td.find('p').remove()
            yield td.text().replace('\n', '')

    def crawl_kuaidaili(self, page_count=1):
        """Yield ``'ip:port'`` strings scraped from www.kuaidaili.com.

        Args:
            page_count: How many listing pages to crawl, starting at page 1.
        """
        start_url = 'https://www.kuaidaili.com/free/inha/{}/'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if not html:
                continue
            doc = pq(html)
            # .items() is required: it yields each row wrapped as a PyQuery
            # object, so .find() can be called on it.
            for tr in doc('#list table tbody tr').items():
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                # Skip rows lacking either cell so a bare ':' is never
                # emitted as a proxy.
                if ip and port:
                    yield ':'.join([ip, port])

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值