爬虫

最新推荐文章于 2024-03-21 08:48:17 发布

菜鸟阿都

最新推荐文章于 2024-03-21 08:48:17 发布

阅读量634

点赞数 1

分类专栏：爬虫　文章标签：spider

本文链接：https://blog.csdn.net/douzhenwen/article/details/80199549

版权

爬虫专栏收录该内容

5 篇文章 6 订阅

订阅专栏

 
  #-*- coding:utf-8 -*- 
 
  import urllib, urllib2 
 
  import time 
 
  from fake_useragent 
  import UserAgent 
 
  import requests 
 
  import threading 
 
  from bs4 
  import BeautifulSoup 
 
  def get_page_source(url, timeout=10):
      """Download *url* with a random User-Agent header and return the body.

      Args:
          url: Address of the page to fetch.
          timeout: Seconds to wait on the connection before giving up, so an
              unresponsive server cannot block the caller indefinitely.

      Returns:
          The response body exactly as read from the connection (not decoded).

      Raises:
          urllib2.URLError: If the request cannot be completed.
      """
      # A new User-Agent string is drawn from fake_useragent on every call.
      headers = {'User-Agent': UserAgent().random}
      req = urllib2.Request(url, None, headers=headers)
      response = urllib2.urlopen(req, timeout=timeout)
      try:
          return response.read()
      finally:
          # urllib2 responses are not context managers in Python 2, so the
          # connection is released explicitly even when read() fails.
          response.close()
 
   # Running number of proxies checked so far; incremented inside test().
   count = 0
   # File that test() appends every working proxy to.
   filename = 'IpPool.txt'

   # Collected "ip:port" strings from every listing page.
   # NOTE: this name shadows the builtin ``list``; it is kept because the
   # thread-launch loop later in the script iterates over it by this name.
   # It is created once, before the page loop, so results from earlier pages
   # are not thrown away when the next page is parsed.
   list = []

   for i in range(1, 3):
       url = "http://www.xicidaili.com/nn/%s" % i
       html = get_page_source(url)
       soup = BeautifulSoup(html, 'html5lib')

       for idx, tr in enumerate(soup.find_all('tr')):
           if idx == 0:
               # The first row is skipped (presumably the table header).
               continue
           tds = tr.find_all('td')
           if len(tds) < 3:
               # Rows without an ip and a port cell cannot yield a proxy.
               continue
           ip = tds[1].get_text(strip=True)
           port = tds[2].get_text(strip=True)
           if ip and port:
               list.append(ip + ':' + port)

   # Lock shared by the worker threads that run test().
   lock = threading.Lock()
 
  def test(i, timeout=10):
      """Probe one proxy and record it in ``filename`` when it works.

      Args:
          i: Proxy address as an ``ip:port`` string (a full proxy URL with a
              scheme is also accepted).
          timeout: Seconds to wait for the probe request before treating the
              proxy as dead.

      Side effects:
          Increments the global ``count``, appends ``i`` to ``filename`` when
          the probe returns HTTP 200, and prints one result line per proxy.
      """
      global count

      # requests selects a proxy by the scheme of the target URL. The probe
      # URL is https, so the proxy has to be registered for "https" as well;
      # with only an "http" entry the request would bypass the proxy and
      # every address would appear to work.
      proxy_url = i if '://' in i else 'http://' + i
      proxy = {'http': proxy_url, 'https': proxy_url}
      url = "https://www.baidu.com"

      # The slow network call runs outside the lock so the worker threads
      # actually overlap. A failing proxy is reported instead of killing the
      # thread while other threads wait on the lock.
      try:
          resp = requests.get(url, proxies=proxy, timeout=timeout)
          status = resp.status_code
      except requests.RequestException as exc:
          # Show the exception type where the status code would be printed.
          status = exc.__class__.__name__

      # Only the shared counter, the output file and stdout need serialising;
      # "with" guarantees the lock is released on every path.
      with lock:
          count = count + 1
          if status == 200:
              with open(filename, 'a') as f:
                  f.write(i + '\n')
          print("%s: %s %s" % (count, i, status))
 
   # One worker thread per collected proxy address, each running test().
   threads = [threading.Thread(target=test, args=(addr,)) for addr in list]

   for worker in threads:
       worker.start()

   # Block the main thread until every worker thread has finished.
   for worker in threads:
       worker.join()

 
  import requests
  from fake_useragent import UserAgent

  # Request one listing page, sending a randomly chosen User-Agent header.
  ua = UserAgent()
  headers = {'User-Agent': ua.random}
  url = 'http://www.xicidaili.com/nn/1'
  resp = requests.get(url, headers=headers)

  body = resp.text
  print(body)
  # Encoding reported on the response object.
  print(resp.encoding)
  # The same text encoded as UTF-8.
  print(body.encode('utf-8'))
  # HTTP status code of the response.
  print(resp.status_code)

菜鸟阿都

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
爬虫

#-*- coding:utf-8 -*-import urllib, urllib2import timefrom fake_useragent import UserAgentimport requestsimport threadingfrom bs4 import BeautifulSoupdef get_page_source(url): headers = {'User-Agen...
复制链接

扫一扫