Python 多线程爬取网站
完整代码:
import csv
import threading
import time

import requests
from bs4 import BeautifulSoup
# An earlier version used a plain threading.Lock() here.
# Cap the number of concurrent download threads: main() acquires one slot
# before starting each thread and download_pics() releases it when done.
lock = threading.Semaphore(20)
# Request headers sent with every listing-page request in getHTMLText().
# NOTE(review): the Cookie looks like a captured browser session and is
# hard-coded; presumably it expires — confirm before relying on it.
headers = {
"User-Agent":'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
# Adjacent string literals below are concatenated into one Cookie value.
'Cookie': 'uuid=8c22c9ec-f40e-49b3-a544-ea0f34f2a603; ganji_uuid=7676341335710599796254; cityDomain=gz; antipas=716477E083567463193N73u8; '
'clueSourceCode=*#00; user_city_id=16; sessionid=4f9c9a87-56bc-47da-9176-f9124bc4c529; cainfo={"ca_a":"-","ca_b%2'
'2:"-","ca_s":"seo_google","ca_n":"default","ca_medium":"-","ca_term":"-","'
'ca_content":"-","ca_'
'campaign":"-","ca_'
'kw":"-","ca_i":"-","scode":"-","keyword":"-","ca_keywordid":"-","display_fin'
'ance_flag":"-","platform"%'
'3A"1","version":1,"client_ab":"-","guid":"8c22c9ec-f40e-49b3-a544-ea0f34f2a603","ca_city"%'
'3A"km","sessionid":"4f9c9a87-56bc-47da-9176-f9124bc4c529"}; Hm_lvt_936a6d5df3f3d309bda39e92da3dd52f=1580653134,1580653'
'170,1580887044; close_finance_popup=2020-02-05; preTime={"last":1580887241,"this":1580653133,"pre":1580653133}; '
'Hm_lpvt_936a6d5df3f3d309bda39e92da3dd52f=1580887242'
}
def getHTMLText(url, timeout=10):
    """Fetch a page and return its body decoded as UTF-8.

    The module-level ``headers`` (User-Agent and Cookie) are sent with the
    request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The decoded page text, or the sentinel string '产生异常' when the
        request fails or the body is not valid UTF-8.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError):
        # Only network and decoding failures map to the sentinel; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        return '产生异常'
def get_deta(html, csv_path='/Users/baby/Desktop/爬取二手车/guazi.csv'):
    """Extract the car entries from a listing page and append them to a CSV.

    One row is appended per ``<li>`` inside the
    ``<ul class="carlist clearfix js-top">`` element. The columns are, in
    order: the ``<h2>`` text, the first and second '|'-separated fields of
    the ``t-i`` div, the constant '广州', the ``<p>`` text of the
    ``t-price`` div, and the ``<em>`` text of the ``t-price`` div (empty
    when there is no ``<em>``).

    Args:
        html: Page markup to parse.
        csv_path: File the rows are appended to.

    Returns:
        A list with the ``src`` of the first image of every entry, in page
        order. The list is empty when the page has no listing element, for
        example when ``html`` is an error placeholder and not a real page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    listing = soup.find('ul', {'class': 'carlist clearfix js-top'})
    if listing is None:
        # Nothing to extract; avoid calling find_all on None.
        return []

    pic_urls = []
    with open(csv_path, 'a', encoding='utf-8', newline='') as f:
        # csv.writer quotes fields that contain commas, so such values no
        # longer shift the columns. '\n' keeps the original line endings.
        writer = csv.writer(f, lineterminator='\n')
        for info in listing.find_all('li'):
            leixing = info.find('h2').get_text()

            # NOTE(review): assumes the t-i text separates its fields with
            # '|' — confirm against the page markup. Indexing the raw string
            # would only yield its first two characters.
            details = info.find('div', {'class': 't-i'}).get_text()
            parts = [part.strip() for part in details.split('|')]
            nianfen = parts[0]
            licheng = parts[1] if len(parts) > 1 else ''

            didian = '广州'

            price_box = info.find('div', {'class': 't-price'})
            shoujia = price_box.find('p').get_text()
            # The <em> element is optional; find() returns None when absent.
            em = price_box.find('em')
            yuanjia = em.get_text() if em is not None else ''

            tupian = info.find('a').find('img')['src']
            pic_urls.append(tupian)

            writer.writerow([leixing, nianfen, licheng, didian, shoujia, yuanjia])
    return pic_urls
def download_pics(url, n, timeout=30):
    """Download one image and save it as ``<n>.jpg``.

    Intended to run in a worker thread started by main(), which acquires one
    slot of the module-level ``lock`` semaphore before starting the thread.
    The slot is released here in all cases, including failures, so a failed
    download cannot permanently use up a slot.

    Args:
        url: Address of the image.
        n: Number used as the file name.
        timeout: Seconds to wait for the server before giving up.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Do not store an HTTP error page under a .jpg name.
        r.raise_for_status()
        with open('/Users/baby/Desktop/爬取二手车/图片/{}.jpg'.format(n), 'wb') as f:
            f.write(r.content)
    except (requests.RequestException, OSError) as exc:
        print('第{}张图片下载失败: {}'.format(n, exc))
    finally:
        # Download finished (or failed): give the slot back.
        lock.release()
def main():
    """Crawl listing pages 1-50, record the entries and download the images.

    Each page is fetched with getHTMLText() and parsed with get_deta(); every
    image URL returned is downloaded in its own thread. The module-level
    ``lock`` semaphore limits how many download threads run at once. The
    total elapsed time is printed after all downloads have finished.
    """
    start = time.perf_counter()
    n = 0
    threads = []
    for i in range(1, 51):
        start_url = 'https://www.guazi.com/gz/buy/o' + str(i) + '/#bread'
        html = getHTMLText(start_url)
        if html == '产生异常':
            # getHTMLText returns this sentinel on failure; there is no
            # markup to parse, so move on to the next page.
            print('第{}页获取失败,已跳过'.format(i))
            continue
        pic_urls = get_deta(html)
        for url in pic_urls:
            # Running counter across all pages; it is also the file name.
            n += 1
            print('正在下载第{}张图片'.format(n))
            # Take a slot; blocks while the maximum number of downloads is
            # in flight. download_pics releases it.
            lock.acquire()
            t = threading.Thread(target=download_pics, args=(url, n))
            t.start()
            threads.append(t)
    # Wait for the remaining downloads so the timing covers them too.
    for t in threads:
        t.join()
    end = time.perf_counter()
    print('所用时间:', end - start)
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()