import
requests
from
lxml
import
etree
from
selenium
import
webdriver
from
selenium.webdriver.chrome.options
import
Options
import
time
import
re
from
multiprocessing.dummy
import
Pool
"""
爬取http://www.goubanjia.com/ ip代理网站
此网站的反爬机制有在显示ip的标签中伪造了dispaly:none的误导信息,使用了js来更改端口号
采取的破解策略为使用selenium无头浏览器,然后使用xpath解析过滤掉误导信息
"""
# --- Fetch the rendered listing page with headless Chrome --------------------
# The port numbers are filled in by JavaScript, so a plain requests.get()
# would not see them; Selenium renders the page first.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(chrome_options=chrome_options)

url = 'http://www.goubanjia.com'
try:
    browser.get(url)
    # time.sleep(3)  # uncomment if the page needs extra time to render
    page_text = browser.page_source
finally:
    # Always release the Chrome process, even when the fetch raises
    # (the original leaked the browser on error).
    browser.quit()

# Parse the rendered HTML once; the browser is no longer needed.
tree = etree.HTML(page_text)
# Collected results: every scraped proxy, and the ones that pass validation.
ip_list = []
right_list = []

# Plain browser User-Agent for the validation requests made with `requests`.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}

# Thread pool (multiprocessing.dummy) used later to validate proxies concurrently.
pool = Pool(10)

# Offline-parsing alternatives kept from the original for reference:
# parser = etree.HTMLParser(encoding="utf-8")
# tree = etree.parse("ip.html", parser=parser)  # parse an HTML/XML document into an etree object
# tree = etree.HTML(page_text)  # parse from a string

# Every <tr> of the proxy table; the first row is the header.
tree_list = tree.xpath('//*[@id="services"]/div/div[2]/div/div/div/table//tr')
# --- Parse each table row into a proxy record --------------------------------
# tree_list[0] is the header row, so it is skipped.
for row in tree_list[1:]:
    # Anti-scraping workaround: the IP cell contains decoy nodes styled
    # "display: none".  Keep only the visible pieces: descendants whose
    # style is NOT "display: none", the cell's own text, and the last
    # <span> (the port, which JavaScript rewrites).
    ip = "".join(row.xpath(
        './td[1]//*[@style!="display: none"]/text()'
        ' | ./td[1]/text()'
        ' | ./td[1]/span[last()]/text()'))
    level = "".join(row.xpath('./td[2]//text()'))  # anonymity level
    # Renamed from `type` so the builtin is not shadowed; the record key
    # stays "type" for the callers below.
    proxy_type = "".join(row.xpath('./td[3]//text()'))  # http / https / ...
    address = "".join(row.xpath('./td[4]/a//text()')).replace(" ", "")
    ip_list.append({
        "ip": ip,
        "level": level,
        "type": proxy_type,
        "address": address,
    })
print(ip_list)
# 使用线程池爬取
def
test_ip(dic):
test_url
=
'http://www.baidu.com/s?ie=UTF-8&wd=ip'
try
:
response
=
requests.get(test_url, headers
=
headers, proxies
=
{dic[
"type"
]: dic[
"ip"
]})
tree
=
etree.HTML(response.text)
li
=
tree.xpath(
'//div[@id="1"]/div[1]/div[1]/div[2]/table//tr/td//text()'
)
ip
=
"".join(li).replace(
' '
, '')
if
re.findall(
'[\d\.]+'
, ip)[
0
]
=
=
dic[
"ip"
].split(
":"
)[
0
]:
right_list.append(dic)
except
Exception as e:
print
(e)
# The original sequential loop (one requests.get per proxy, in order) was very
# slow; the thread pool overlaps up to 10 validation requests at a time.
pool.map(test_ip, ip_list)
print(right_list)
# Release the worker threads now that all proxies have been checked.
pool.close()
pool.join()