from pyquery import PyQuery as pq
import requests
# Default request headers: present a desktop Chrome browser so the
# proxy-listing sites are less likely to reject the crawler.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}
def get_page(url):
    """Fetch *url* with the module-level browser headers.

    Returns the response body as text on HTTP 200, or None on any
    network error or non-200 status, so callers can simply test
    `if html:` (which is what every crawl_* method already does).
    """
    try:
        # A timeout is essential here: the original call could hang
        # forever on a slow or dead proxy-listing site.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        print('Failed to fetch', url, 'status', response.status_code)
    except requests.RequestException as e:
        # Best-effort crawler: report and move on instead of crashing
        # the whole crawl loop on one bad site.
        print('Error fetching', url, e)
    return None
class Crawler(object, metaclass=ProxyMetaclass):
    """Site-specific proxy crawlers.

    Each ``crawl_*`` method is a generator yielding proxies as
    ``'ip:port'`` strings. ProxyMetaclass (defined elsewhere in the
    project) presumably collects the ``crawl_*`` method names so that
    ``get_proxies`` can be driven by name — TODO confirm.
    """

    def get_proxies(self, callback):
        """Run the crawler method named *callback* and collect its proxies.

        :param callback: name of a ``crawl_*`` method on this class.
        :return: list of ``'ip:port'`` strings.
        """
        proxies = []
        # getattr-based dispatch replaces the original
        # eval("self.{}()".format(callback)): identical effect for valid
        # method names, but no arbitrary-code-execution hazard.
        for proxy in getattr(self, callback)():
            print('成功获取到代理', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_daili66(self, page_count=3):
        """Crawl www.66ip.cn and yield proxies as 'ip:port'.

        :param page_count: number of listing pages to crawl, starting at 1.
        """
        start_url = 'http://www.66ip.cn/{}.html'
        # Original used range(page_count, page_count + 4), which crawled a
        # fixed window of 4 pages starting at an arbitrary offset; crawl
        # pages 1..page_count instead, consistent with crawl_kuaidaili.
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                # tr:gt(0) selects every row after the table header.
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    # Skip malformed rows rather than yielding a bare ':'.
                    if ip and port:
                        yield ':'.join([ip, port])

    def crawl_goubanjia(self):
        """Crawl www.goubanjia.com and yield proxies as 'ip:port'.

        The site obfuscates addresses by interleaving decoy <p> elements
        inside each td.ip cell; those are removed before reading the text.
        """
        start_url = 'http://www.goubanjia.com/'
        html = get_page(start_url)
        print('Crawling', start_url)
        if html:
            doc = pq(html)
            tds = doc('td.ip').items()
            for td in tds:
                td.find('p').remove()
                # The remaining text already contains the ':' separator;
                # only stray newlines need stripping.
                yield td.text().replace('\n', '')

    def crawl_kuaidaili(self, page_count=1):
        """Crawl www.kuaidaili.com free listings and yield proxies as 'ip:port'.

        :param page_count: number of listing pages to crawl, starting at 1.
        """
        start_url = 'https://www.kuaidaili.com/free/inha/{}/'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            html = get_page(url)
            print('Crawling', url)
            if html:
                doc = pq(html)
                # .items() is required: without it the selection iterates
                # raw elements instead of PyQuery wrappers.
                trs = doc('#list table tbody tr').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    if ip and port:
                        yield ':'.join([ip, port])