#-*- coding:utf-8 -*-
import urllib, urllib2
import time
from fake_useragent
import UserAgent
import requests
import threading
from bs4
import BeautifulSoup
def get_page_source(url):
    """Fetch *url* and return the raw response body as a byte string.

    A random desktop User-Agent is sent with each request so the proxy
    listing site is less likely to block us for looking like a script.
    """
    headers = {'User-Agent': UserAgent().random}
    req = urllib2.Request(url, None, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # FIX: the original never closed the response, leaking the HTTP
        # connection; urllib2 responses are not context managers in Py2.
        response.close()
count = 0
filename = 'IpPool.txt'

# Collected "ip:port" strings scraped from the proxy listing pages.
# NOTE: the name shadows the builtin ``list`` -- kept as-is because the
# thread launcher further down iterates over this exact module-level name.
# FIX: initialise ONCE, before the page loop.  The original re-created the
# list inside the loop, so proxies from every page except the last were
# silently discarded.
list = []
for i in range(1, 3):  # scrape listing pages 1 and 2
    url = "http://www.xicidaili.com/nn/%s" % i
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'html5lib')
    # Row 0 of the table is the header, so keep only the data rows.
    for tr in soup.find_all('tr')[1:]:
        tds = tr.find_all('td')
        ip = tds[1].contents[0]
        port = tds[2].contents[0]
        list.append(ip + ':' + port)

# Create a lock guarding the shared counter and the output file.
lock = threading.Lock()
def test(i):
    """Check whether proxy *i* ("ip:port") works by fetching baidu.com.

    Working proxies are appended to ``filename``, one per line.  The
    shared counter and the file are protected by ``lock``, but the
    network request itself runs *outside* the lock so the worker threads
    can actually overlap.  The original held a bare acquire()/release()
    pair around the whole body: every thread was serialised, and any
    exception from requests.get left the lock held forever, deadlocking
    all remaining workers.
    """
    global count
    with lock:
        count = count + 1
        seq = count  # snapshot for the log line below
    proxy = {'http': i}
    url = "https://www.baidu.com"
    try:
        resp = requests.get(url, proxies=proxy)
    except requests.RequestException:
        # Dead or malformed proxies are expected; just skip them.
        return
    if resp.status_code == 200:
        # ``with lock`` guarantees release even if the write fails;
        # the redundant f.close() inside the with-block is gone too.
        with lock:
            with open(filename, 'a') as f:
                f.write(i + '\n')
            print("%s:%s %s" % (seq, i, resp.status_code))
# Launch one worker thread per scraped proxy address, then block the
# main thread until every worker has finished.
threads = []
for candidate in list:
    worker = threading.Thread(target=test, args=(candidate,))
    threads.append(worker)
    worker.start()
for worker in threads:
    worker.join()
import requests
from fake_useragent import UserAgent

# Standalone sanity check: fetch the first listing page directly (no
# proxy) with a randomised User-Agent and dump what came back.
ua = UserAgent()
headers = {'User-Agent': ua.random}
url = 'http://www.xicidaili.com/nn/1'
resp = requests.get(url, headers=headers)
print(resp.text)
print(resp.encoding)                # detected response encoding
print(resp.text.encode('utf-8'))    # body re-encoded as UTF-8 bytes
print(resp.status_code)             # HTTP status code