# 上代码 (code below)
# ===== 多线程爬取 — multithreaded weather crawler =====
import csv
import threading
import time
from queue import Empty, Queue

import pymysql
import requests
from bs4 import BeautifulSoup
from pyecharts.charts import Bar
gLock = threading.Lock()
class Producer(threading.Thread):
def __init__(self,page_queue,temp_queue,*args,**kwargs):
super(Producer, self).__init__(*args, **kwargs)
self.page_queue = page_queue
self.temp_queue = temp_queue
def run(self):
while True:
if self.page_queue.empty():
break
url = self.page_queue.get()
# time.sleep(0.5)
self.parse_page(url)
def parse_page(self,url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3722.400 QQBrowser/10.5.3751.400'
}
text = requests.get(url, headers=headers).content.decode('utf-8')
htmlElemts = BeautifulSoup(text, 'html5lib')
div = htmlElemts.find('div', class_="conMidtab")
tables = div.find_all('table')
for table in tables:
trs = table.find_all('tr')[2:]
for index, tr in enumerate(trs):
tds = tr.find_all('td')
city_td = tds[0]
if index == 0:
city_td = tds[1]
cities = list(city_td.stripped_strings)[0]
min_temp_id = tds[-2]
min_temp = list(min_temp_id.stripped_strings)[0]
max_temp_id = tds[-5]
max_temp = list(max_temp_id.stripped_strings)[0]
weather_id = tds[-4]
weather = list(weather_id.stripped_strings)[0]
self.temp_queue.put({'城市':cities,"天气":weather,"最高气温":max_temp,"最低气温":min_temp})
# print(cities)
class Consumer(threading.Thread):
def __init__(self,page_queue,temp_queue,*args,**kwargs):
super(Consumer, self).__init__(*args, **kwargs)
self.page_queue = page_queue
self.temp_queue = temp_queue
def run(self):
while True:
# citry = citrys['城市']
# weather = citrys['天气']
# max_temp = citrys['最高气温']
# min_temp = citrys['最低气温']
gLock.acquire()
if self.temp_queue.empty() and self.page_queue.empty():
gLock.release()
break
self.write_csv(self.temp_queue.get())
gLock.release()
def write_csv(self,cities):
city = [cities]
print(city)
headers = ['城市','天气','最高气温','最低气温']
with open('城市天气预报.csv', 'a', newline='') as fp:
writer = csv.DictWriter(fp, headers)
# writer.writeheader()
writer.writerows(city)
def main():
page_queue = Queue(100)
temp_queue = Queue(1000)
urls = [
'http://www.weather.com.cn/textFC/hb.shtml',
'http://www.weather.com.cn/textFC/db.shtml',
'http://www.weather.com.cn/textFC/hd.shtml',
'http://www.weather.com.cn/textFC/hz.shtml',
'http://www.weather.com.cn/textFC/hn.shtml',
'http://www.weather.com.cn/textFC/xb.shtml',
'http://www.weather.com.cn/textFC/xn.shtml',
'http://www.weather.com.cn/textFC/gat.shtml'
]
for url in urls:
page_queue.put(url)
print(url)
for x in range(5):
t = Producer(page_queue,temp_queue)
t.start()
for x in range(5):
t = Consumer(page_queue,temp_queue)
t.start()
if __name__ == '__main__':
main()
# ===== 单线程爬取 — single-threaded image crawler =====
import requests
import os
from lxml import html
import re
from urllib import request
import time
etree = html.etree
def spider(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3722.400 QQBrowser/10.5.3751.400'
}
text = requests.get(url,headers=headers).text
htmlElements = etree.HTML(text)
imgs_urls = htmlElements.xpath('//div[@class="random_article"]//img[@class!="gif"]')
for img in imgs_urls:
img_url = img.get('data-original')
alt = img.xpath('./@alt')[0]
alt = re.sub(r"[\??\.。,!::]","",alt)
auffix = os.path.splitext(img_url)[1]
filename = alt+auffix
request.urlretrieve(img_url,'E:/imgs/'+filename)
print(filename)
time.sleep(0.5)
# break
def main():
for num in range(1,2):
url = 'http://www.doutula.com/article/list/?page=%d' %num
spider(url)
# break
if __name__ == '__main__':
main()
# request.urlretrieve("https://v.qq.com/x/cover/8l8dpqzvij9kza3.html", 'E:/imgs/视频.html')