A simple Python multithreaded crawler

The script below logs into a Django-backed web appliance (grabbing the csrfmiddlewaretoken from the login form first), then uses a pool of worker threads to scrape vulnerability-description pages and dump their table cells into an Excel workbook.

# -*- coding: utf-8 -*-
import re
import threading

import requests
import urllib3
import xlwt
import threadpool
from bs4 import BeautifulSoup

lock = threading.Lock()  # serializes access to the shared row counter and worksheet

urllib3.disable_warnings()  # silence the InsecureRequestWarning caused by verify=False

cnt = 0  # next free row in the worksheet, shared by all worker threads
url = 'https://200.200.226.74/accounts/login_view/'
session = requests.session()
session.verify = False  # the target uses a self-signed certificate
session.headers.update({
	'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0',
	'Referer': 'https://200.200.226.74/accounts/login_view/'
})

# Fetch the login page first to pick up the Django CSRF token
r = session.get(url)
pattern = re.compile("<input type='hidden' name='csrfmiddlewaretoken' value=\"([A-Za-z0-9]{32})\">")
result = pattern.findall(r.text)

# Log in; the session object keeps the auth cookie for all later requests
data = {
	'username': 'admin',
	'password': '123',
	'csrfmiddlewaretoken': result[0]
}
r = session.post(url, data=data)

# One workbook/sheet shared by all threads (xlwt itself is not thread-safe)
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('test', cell_overwrite_ok=True)
# Worker: fetch one vulnerability-description page and append its <td>
# cells as the next row of the worksheet.
def fetch_vul_desc(i):
	url1 = 'https://200.200.226.74/template/show_vul_desc?id=%d' % i
	print(url1)
	r = session.get(url1)
	soup = BeautifulSoup(r.text, "html.parser")
	global cnt
	# cnt is shared and xlwt writes are not thread-safe, so hold the lock
	# for the whole row; release it even if a write raises
	lock.acquire()
	try:
		col = 0
		for x in soup.find_all('td'):
			sheet.write(cnt, col, x.get_text())
			col += 1
		if col > 0:  # only advance the row counter if the page had data
			cnt += 1
	finally:
		lock.release()

# Crawl ids 90000-90499 with a pool of 100 worker threads
work_list = list(range(90000, 90500))
pool = threadpool.ThreadPool(100)
reqs = threadpool.makeRequests(fetch_vul_desc, work_list)
for req in reqs:
	pool.putRequest(req)
pool.wait()  # block until every queued request has been processed
book.save(r'C:\Users\q\Desktop\test2.xls')
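The third-party threadpool package used above is long unmaintained; the same fan-out can be done with the standard library's concurrent.futures. Below is a minimal sketch of an equivalent driver, assuming the fetch_vul_desc worker and the book workbook defined above (it replaces only the work_list/pool lines, not the whole script):

from concurrent.futures import ThreadPoolExecutor

# Exiting the with-block waits for all submitted tasks to finish;
# consuming the map() iterator also surfaces any worker exceptions.
with ThreadPoolExecutor(max_workers=100) as executor:
	list(executor.map(fetch_vul_desc, range(90000, 90500)))

book.save(r'C:\Users\q\Desktop\test2.xls')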

 
