# Image crawler: scrapes listing pages with requests + BeautifulSoup, fanning work out to threads via Queue.Queue.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import urllib,os
import threading
import Queue
# Listing-page URLs; populated in __main__ but only consumed by the
# commented-out single-threaded variant inside geturl().
urllist=[]
# Work queue of listing-page URLs shared by all downloader threads.
g_queue = Queue.Queue()
# Worker threads started by t_do(); joined by the main thread before exit.
threads = []
def doloadimg(url, urltitle):
    """Download the image at *url* into the local 'images' directory.

    url: absolute URL of the image.
    urltitle: file name (including extension) to save it under.
    """
    # urlretrieve fails with IOError if the target directory is missing,
    # so create it on first use.
    if not os.path.isdir('images'):
        os.makedirs('images')
    path = os.path.join('images', urltitle)
    urllib.urlretrieve(url, filename=path)
def geturl(queue_l):
    """Worker loop: pull listing-page URLs off *queue_l*, scrape every
    thumbnail anchor ('a.col-sm-3') on the page and download its image
    via doloadimg().

    Exits once the queue stays empty for a few seconds, so the main
    thread's join() on the workers can actually complete.
    """
    while True:
        try:
            # Bounded wait: the original get(block=True) blocked forever
            # after the queue drained, deadlocking the joins in __main__.
            url = queue_l.get(block=True, timeout=5)
        except Exception:
            break  # queue empty long enough -- no more work for this thread
        try:
            respose = requests.get(url)
            soup = BeautifulSoup(respose.content, 'lxml')
            s_list = soup.findChildren('a', attrs={'class': 'col-sm-3'})
            for tag in s_list:
                # Single-argument print() is valid on both Py2 and Py3.
                print("%s %s %s %s" % (threading.current_thread().getName(),
                                       tag.name, tag.img['src'],
                                       tag.img['title']))
                doloadimg(tag.img['src'], tag.img['title'] + '.jpg')
        except Exception as e:
            # Best-effort crawl: log the page failure and move on.
            print(str(e))
        finally:
            # Mark the item done only AFTER it was processed (the original
            # called task_done() immediately after get()).
            queue_l.task_done()
def eseabd():
    # Debug helper: show which thread is currently executing.
    cur = threading.current_thread()
    print(cur)
def t_do():
    """Spawn 20 daemon worker threads running geturl() over the shared
    g_queue, recording them in the module-level `threads` list."""
    for i in range(20):
        try:
            t = threading.Thread(target=geturl, args=(g_queue,))
            # .name / .daemon attributes replace the deprecated
            # setName() / setDaemon() calls (valid on Py2.6+ and Py3).
            t.name = "thread" + str(i)
            t.daemon = True
            t.start()
            threads.append(t)
        except Exception as e:
            # Best-effort: log and keep launching the remaining workers.
            print(str(e))
if __name__ == '__main__':
    # Enqueue the 1000 listing pages; build each URL once instead of twice.
    for i in range(1000):
        page_url = 'http://www.adoutu.com/picture/list/' + str(i + 1)
        urllist.append(page_url)
        g_queue.put(page_url)
    # Report the total once, after filling the queue.
    print(g_queue.qsize())
    t_do()
    # Wait for the workers to finish draining the queue.
    for t in threads:
        t.join()
    print("Exiting Main Thread")
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import json
import requests
import html5lib
from bs4 import BeautifulSoup
# Accumulates one {'city': ..., 'min_temp': ...} dict per city across
# every page that deal_url() scrapes.
all_data = []
def deal_url(url):
    """Scrape one weather.com.cn text-forecast page and append each
    city's minimum temperature to the module-level all_data list.

    url: a http://www.weather.com.cn/textFC/*.shtml region page.
    Raises ValueError if a temperature cell is not an integer.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    }
    respose = requests.get(url, headers=headers)
    content = respose.content.decode('utf-8')
    # The HK/Macao/Taiwan page has markup lxml mishandles; html5lib is
    # slower but recovers it. Pick the parser ONCE -- the original parsed
    # the page with lxml and then threw that soup away for gat.shtml.
    parser = 'html5lib' if url.split('/').pop() == 'gat.shtml' else 'lxml'
    soup = BeautifulSoup(content, parser)
    conMidtab = soup.find('div', attrs={'class': 'conMidtab'})
    tables = conMidtab.find_all('table')
    for table in tables:
        # The first two rows of each table are column headers.
        trs = table.find_all('tr')[2:]
        for index, tr in enumerate(trs):
            tds = tr.find_all('td')
            # The first data row repeats the province name in column 0,
            # so the city name sits in column 1 there.
            city_td = tds[1] if index == 0 else tds[0]
            wendu_td = tds[-2]
            city_td_name = list(city_td.stripped_strings)[0]
            min_temp = list(wendu_td.stripped_strings)[0]
            all_data.append({'city': city_td_name, 'min_temp': int(min_temp)})
#
def main():
    """Fetch every regional forecast page, then print all cities ordered
    by minimum temperature plus parallel city/temperature lists."""
    pages = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    ]
    for page in pages:
        deal_url(page)
    # Coldest cities first.
    all_data.sort(key=lambda entry: entry['min_temp'])
    print(json.dumps(all_data, encoding='UTF-8', ensure_ascii=False))
    cities = [entry['city'] for entry in all_data]
    temps = [entry['min_temp'] for entry in all_data]
    print(json.dumps(cities, encoding='utf-8', ensure_ascii=False))
    print(json.dumps(temps, encoding='utf-8', ensure_ascii=False))
if __name__ == '__main__':
    # Run the weather scraper when executed as a script.
    main()