The project needs it, so first some practice crawling images with multiple threads.

Mainly using BeautifulSoup and requests together with Queue.Queue to do the crawling (both scripts below are Python 2; requests, beautifulsoup4, lxml, and html5lib are third-party packages that have to be installed first).

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

import urllib
import os
import threading
import Queue

g_queue = Queue.Queue()
threads = []


def doloadimg(url, urltitle):
    # Save one image under images/<title>.jpg.
    path = os.path.join('images', urltitle)
    urllib.urlretrieve(url, filename=path)


def geturl(queue_l):
    # Worker: keep pulling list pages off the shared queue until it is drained.
    while True:
        try:
            url = queue_l.get(block=False)
        except Queue.Empty:
            break  # queue is empty, exit so the main thread's join() can return
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'lxml')
            s_list = soup.findChildren('a', attrs={'class': 'col-sm-3'})
            for tag in s_list:
                print threading.current_thread().getName(), tag.name, tag.img['src'], tag.img['title']
                doloadimg(tag.img['src'], tag.img['title'] + '.jpg')
        except Exception, e:
            print str(e)
        finally:
            queue_l.task_done()  # mark the page done only after it has been processed
def t_do():
    # Spawn 20 worker threads that all consume from the shared queue.
    for i in range(20):
        try:
            t = threading.Thread(target=geturl, args=(g_queue,))
            t.setName("thread" + str(i))
            t.setDaemon(True)
            t.start()
            threads.append(t)
        except Exception, e:
            print str(e)


if __name__ == '__main__':
    if not os.path.isdir('images'):
        os.makedirs('images')  # doloadimg() writes into this directory
    for i in range(1000):
        g_queue.put('http://www.adoutu.com/picture/list/' + str(i + 1))
    print g_queue.qsize()
    t_do()
    for t in threads:
        t.join()
    print "Exiting Main Thread"






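The second exercise crawls the text forecast pages on weather.com.cn, collects each city's minimum temperature, and sorts every city nationwide from coldest up: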
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import json

import requests
import html5lib  # BeautifulSoup loads this parser by name; the import just fails fast if it is missing
from bs4 import BeautifulSoup

all_data = []


def deal_url(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    }
    response = requests.get(url, headers=headers)
    content = response.content.decode('utf-8')
    soup = BeautifulSoup(content, 'lxml')
    if url.split('/').pop() == 'gat.shtml':
        # The Hong Kong/Macao/Taiwan page has broken markup that lxml mis-parses;
        # fall back to the more forgiving html5lib parser.
        soup = BeautifulSoup(content, 'html5lib')
    conMidtab = soup.find('div', attrs={'class': 'conMidtab'})
    tables = conMidtab.find_all('table')
    for table in tables:
        # The first two rows of each province table are headers.
        trs = table.find_all('tr')[2:]
        for index, tr in enumerate(trs):
            tds = tr.find_all('td')
            city_td = tds[0]
            if index == 0:
                # In the first data row the first cell is the province name
                # (it spans the whole table), so the city sits in the second cell.
                city_td = tds[1]
            wendu_td = tds[-2]  # second-to-last cell holds the minimum temperature
            city_td_name = list(city_td.stripped_strings)[0]
            min_temp = list(wendu_td.stripped_strings)[0]
            all_data.append({'city': city_td_name, 'min_temp': int(min_temp)})


def main():
    urllist = ['http://www.weather.com.cn/textFC/hb.shtml',
               'http://www.weather.com.cn/textFC/db.shtml',
               'http://www.weather.com.cn/textFC/hd.shtml',
               'http://www.weather.com.cn/textFC/hz.shtml',
               'http://www.weather.com.cn/textFC/hn.shtml',
               'http://www.weather.com.cn/textFC/xb.shtml',
               'http://www.weather.com.cn/textFC/xn.shtml',
               'http://www.weather.com.cn/textFC/gat.shtml',
               ]
    for url in urllist:
        deal_url(url)
    # Sort all cities by minimum temperature, coldest first.
    all_data.sort(key=lambda data: data['min_temp'])
    print json.dumps(all_data, encoding='UTF-8', ensure_ascii=False)
    # e.g. [{"city": "阿里", "min_temp": 3}, {"city": "果洛", "min_temp": 5}]
    cities = list(map(lambda x: x['city'], all_data))
    temps = list(map(lambda x: x['min_temp'], all_data))
    print json.dumps(cities, encoding='utf-8', ensure_ascii=False)
    print json.dumps(temps, encoding='utf-8', ensure_ascii=False)


if __name__ == '__main__':
    main()
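
The cities and temps lists at the end look like preparation for a chart; purely as an illustration (matplotlib is my assumption here, the original script only prints the lists), the ten coldest cities could be plotted like this:

# Hypothetical follow-up: bar-chart the ten coldest cities.
# matplotlib is an assumption; the original script only prints the lists.
import matplotlib
matplotlib.use('Agg')  # render to a file, no display needed
import matplotlib.pyplot as plt

def plot_coldest(cities, temps, n=10):
    plt.rcParams['font.sans-serif'] = ['SimHei']  # a CJK-capable font so city names render
    plt.bar(range(n), temps[:n])
    plt.xticks(range(n), cities[:n], rotation=45)
    plt.ylabel('min temp (C)')
    plt.tight_layout()
    plt.savefig('coldest.png')

# called after main() has filled the two lists:
# plot_coldest(cities, temps)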
 

Reposted from: https://www.cnblogs.com/xiaocry/articles/11378000.html
