Python 多线程 URL 采集器(python3 · 多线程 · XPath 采集)

#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""Thread3: multithreaded scraping test.

Pulls URL task documents from MongoDB, fans them out to worker threads,
scrapes agent detail pages with requests + lxml XPath, and bulk-inserts
the assembled documents back into MongoDB.
"""

import json
import os
import queue
import threading
import time

import requests
from lxml import html

import Mongo_utils
import mysqlUtils

etree = html.etree

# Polled by the worker threads; the main thread sets it to 1 to make them exit.
exitFlag = 0

# MongoDB task database handle (project helper; host/port encoded in the name).
db = Mongo_utils.mongodb_15_27017task()
table = db["xx_anjuke_agent1"]      # destination collection for scraped agent docs
table_urls = db["xx_spider_urls1"]  # source collection of URL task documents

list_pro = mysqlUtils.select_pro()  # proxy list from MySQL — not used below; TODO confirm
list_urls = table_urls.find().limit(2000)  # cap each run at 2000 tasks

insert_list = []  # documents buffered for a bulk insert_many

del_list = []  # URLs scheduled for removal after a successful insert (currently unused)


class myThread(threading.Thread):
    """Worker thread: drains the shared work queue through spider()."""

    def __init__(self, threadId, name, q):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.name = name  # Thread.name's setter coerces to str, so int ids are safe
        self.q = q        # shared queue.Queue of URL task documents

    def run(self):
        print("开始线程" + self.name)
        spider(self.name, self.q)
        print("退出线程" + self.name)


def head():
    """Return the fixed HTTP request headers used for every request."""
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "upgrade-insecure-requests": "1",
        "Connection": "keep-alive",
        "Content-Type": "text/html; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    }
    return headers


def spider(name, q):
    """Consume task documents from q until exitFlag is set.

    For each task: fetch the contact JSON and the detail page, parse the
    agent fields via XPath, and append the assembled document to the global
    insert_list. Kills the whole process (os._exit) when the site answers
    302, which indicates the anti-bot verification page.
    """
    while not exitFlag:
        queueLock.acquire()
        if not workQueue.empty():
            i = q.get()
            queueLock.release()
            _id = i["_id"]
            city = i["city"]
            zone = i["zone"]
            street = i["street"]
            urls = i["urls"]
            headers = head()
            try:
                # NOTE(review): the contact URL was redacted in the original
                # source; it must contain a %s placeholder for _id to work.
                url = "https://。。。。。。。。。。。" % _id  # //,proxies=proxy
                response_contact = requests.session().get(
                    url=url, allow_redirects=False, headers=headers, timeout=1)
                print(response_contact.status_code)
                if response_contact.status_code == 302:
                    # Redirected to the captcha/verification page: abort the run.
                    print("验证")
                    print(url)
                    os._exit(0)
                res = json.loads(response_contact.text)
                contact = res['data']
                response_dl = requests.session().get(
                    url=urls, allow_redirects=False, headers=headers, timeout=1)
                if response_dl.status_code == 302:
                    print("验证")
                    print(urls)
                    os._exit(0)
                if "获取成功" not in response_contact.text or "房屋编码" not in response_dl.text:
                    # Bug fix: the original printed "pass" but fell through and
                    # parsed the page anyway; skip records missing the markers.
                    print("pass")
                    continue
                # Renamed from `html`: the original shadowed the lxml.html module.
                page = etree.HTML(response_dl.content)
                name = page.xpath("//div[@class='brokercard-name']/text()")[0].strip()
                company = page.xpath("//div[@class='broker-company']/p[1]/a/text()")[0]
                company_url = page.xpath("//div[@class='broker-company']/p[1]/a/@href")[0]
                store = page.xpath("//div[@class='broker-company']/p[2]/span/text()")[0]
                staffNo = "https://anjuxingye1.anjuke.com/gongsi-jjr-%s/" % _id
                # Bug fix: the original dict literal repeated "store_url" and
                # "staffNo"; only the surviving (last) values are kept here.
                mydict = {
                    "_id": _id, "city": city, "zone": zone, "street": street,
                    "name": name, "company": company, "company_url": company_url,
                    "store": store, "site": "anjuke", "store_url": "",
                    "staffNo": staffNo, "tag": "8", "all_comm": "",
                    "contact": contact,
                }
                insert_list.append(mydict)
                # del_list.append(urls)
                print("size: %s" % len(insert_list))
            except Exception:
                # Best-effort: any network/parse failure just drops this record.
                pass
            print("%s processing %s" % (name, i))
        else:
            queueLock.release()
            # time.sleep(1)

# --- main driver -------------------------------------------------------------

threadList = range(0, 5)  # five worker threads
queueLock = threading.Lock()
workQueue = queue.Queue(50000)
threads = []
threadID = 1

# Start the worker pool.
for tName in threadList:
    thread = myThread(threadID, tName, workQueue)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue with the URL tasks fetched from MongoDB.
queueLock.acquire()
for word in list_urls:
    workQueue.put(word)
queueLock.release()

# Busy-wait until the workers have drained the queue.
while not workQueue.empty():
    pass

# Flush the buffer once it holds more than 10 documents.
if len(insert_list) > 10:
    try:
        # ordered=False: keep inserting past duplicate-key errors.
        table.insert_many(insert_list, ordered=False)
        # table_urls.remove({"urls": {"$in": del_list}})
        print("插入1000")
    except Exception as e:
        print(e)
    insert_list.clear()
    del_list.clear()

# Tell the worker threads it is time to exit.
# os._exit(0)
exitFlag = 1

# Final flush of whatever is still buffered. Bug fix: insert_many raises on
# an empty list — the original relied on a bare except to swallow that.
if insert_list:
    try:
        table.insert_many(insert_list, ordered=False)
        # table_urls.remove({"urls": {"$in": del_list}})
        print("插入1000")
    except Exception:
        pass
insert_list.clear()
del_list.clear()

# Wait for all worker threads to finish.
for t in threads:
    t.join()
print("退出主线程")

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值