# Image crawler: scrapes listing pages with requests + BeautifulSoup, fanning work out to threads via Queue.Queue.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import urllib,os
import threading
import Queue
# Listing-page URLs; populated in __main__ but only consumed by the
# commented-out single-threaded variant inside geturl().
urllist=[]
# Work queue of listing-page URLs shared by all downloader threads.
g_queue = Queue.Queue()
# Worker threads started by t_do(); joined by the main thread before exit.
threads = []
def doloadimg(url, urltitle):
    """Download the image at *url* into the local 'images' directory.

    url: absolute URL of the image.
    urltitle: file name (including extension) to save it under.
    """
    # urlretrieve fails with IOError if the target directory is missing,
    # so create it on first use.
    if not os.path.isdir('images'):
        os.makedirs('images')
    path = os.path.join('images', urltitle)
    urllib.urlretrieve(url, filename=path)
def geturl(queue_l):
    """Worker loop: pull listing-page URLs off *queue_l*, scrape every
    thumbnail anchor ('a.col-sm-3') on the page and download its image
    via doloadimg().

    Exits once the queue stays empty for a few seconds, so the main
    thread's join() on the workers can actually complete.
    """
    while True:
        try:
            # Bounded wait: the original get(block=True) blocked forever
            # after the queue drained, deadlocking the joins in __main__.
            url = queue_l.get(block=True, timeout=5)
        except Exception:
            break  # queue empty long enough -- no more work for this thread
        try:
            respose = requests.get(url)
            soup = BeautifulSoup(respose.content, 'lxml')
            s_list = soup.findChildren('a', attrs={'class': 'col-sm-3'})
            for tag in s_list:
                # Single-argument print() is valid on both Py2 and Py3.
                print("%s %s %s %s" % (threading.current_thread().getName(),
                                       tag.name, tag.img['src'],
                                       tag.img['title']))
                doloadimg(tag.img['src'], tag.img['title'] + '.jpg')
        except Exception as e:
            # Best-effort crawl: log the page failure and move on.
            print(str(e))
        finally:
            # Mark the item done only AFTER it was processed (the original
            # called task_done() immediately after get()).
            queue_l.task_done()
def eseabd():
    # Debug helper: show which thread is currently executing.
    cur = threading.current_thread()
    print(cur)
def t_do():
    """Spawn 20 daemon worker threads running geturl() over the shared
    g_queue, recording them in the module-level `threads` list."""
    for i in range(20):
        try:
            t = threading.Thread(target=geturl, args=(g_queue,))
            # .name / .daemon attributes replace the deprecated
            # setName() / setDaemon() calls (valid on Py2.6+ and Py3).
            t.name = "thread" + str(i)
            t.daemon = True
            t.start()
            threads.append(t)
        except Exception as e:
            # Best-effort: log and keep launching the remaining workers.
            print(str(e))
if __name__ == '__main__':
    # Enqueue the 1000 listing pages; build each URL once instead of twice.
    for i in range(1000):
        page_url = 'http://www.adoutu.com/picture/list/' + str(i + 1)
        urllist.append(page_url)
        g_queue.put(page_url)
    # Report the total once, after filling the queue.
    print(g_queue.qsize())
    t_do()
    # Wait for the workers to finish draining the queue.
    for t in threads:
        t.join()
    print("Exiting Main Thread")
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import json
import requests
import html5lib
from bs4 import BeautifulSoup
# Accumulates one {'city': ..., 'min_temp': ...} dict per city across
# every page that deal_url() scrapes.
all_data = []
def deal_url(url):
    """Scrape one weather.com.cn text-forecast page and append each
    city's minimum temperature to the module-level all_data list.

    url: a http://www.weather.com.cn/textFC/*.shtml region page.
    Raises ValueError if a temperature cell is not an integer.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    }
    respose = requests.get(url, headers=headers)
    content = respose.content.decode('utf-8')
    # The HK/Macao/Taiwan page has markup lxml mishandles; html5lib is
    # slower but recovers it. Pick the parser ONCE -- the original parsed
    # the page with lxml and then threw that soup away for gat.shtml.
    parser = 'html5lib' if url.split('/').pop() == 'gat.shtml' else 'lxml'
    soup = BeautifulSoup(content, parser)
    conMidtab = soup.find('div', attrs={'class': 'conMidtab'})
    tables = conMidtab.find_all('table')
    for table in tables:
        # The first two rows of each table are column headers.
        trs = table.find_all('tr')[2:]
        for index, tr in enumerate(trs):
            tds = tr.find_all('td')
            # The first data row repeats the province name in column 0,
            # so the city name sits in column 1 there.
            city_td = tds[1] if index == 0 else tds[0]
            wendu_td = tds[-2]
            city_td_name = list(city_td.stripped_strings)[0]
            min_temp = list(wendu_td.stripped_strings)[0]
            all_data.append({'city': city_td_name, 'min_temp': int(min_temp)})
#
def main():
    """Fetch every regional forecast page, then print all cities ordered
    by minimum temperature plus parallel city/temperature lists."""
    pages = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    ]
    for page in pages:
        deal_url(page)
    # Coldest cities first.
    all_data.sort(key=lambda entry: entry['min_temp'])
    print(json.dumps(all_data, encoding='UTF-8', ensure_ascii=False))
    cities = [entry['city'] for entry in all_data]
    temps = [entry['min_temp'] for entry in all_data]
    print(json.dumps(cities, encoding='utf-8', ensure_ascii=False))
    print(json.dumps(temps, encoding='utf-8', ensure_ascii=False))
if __name__ == '__main__':
    # Run the weather scraper when executed as a script.
    main()