# Basic (sequential) joke-page crawler
import urllib.request
import re
import urllib.error
# Browser-like User-Agent header sent with every request.
# The header name must be exactly "User-Agent"; a stray space inside the name
# produces a header the server will not recognise as a User-Agent.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3641.400 QQBrowser/10.4.3284.400")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Install the opener globally so the plain urlopen() calls below use it.
urllib.request.install_opener(opener)

# Captures the text of the first <span> inside each <div class="content">.
# re.S lets ".*?" match across newlines. Compiled once, outside the page loop.
pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
content_re = re.compile(pat, re.S)

for i in range(1, 36):  # pages 1 through 35
    url = "https://www.qiushibaike.com/text/page/" + str(i)
    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(url) as response:
            pagedata = response.read().decode("utf-8", "ignore")
    except urllib.error.URLError as err:
        # HTTPError is a subclass of URLError. A single unreachable page
        # should not abort the whole crawl, so report it and move on.
        print("Failed to fetch " + url + ": " + str(err))
        continue
    datalist = content_re.findall(pagedata)
    for j, joke in enumerate(datalist):
        print("第" + str(i) + "页第" + str(j) + "个段子内容是: ")
        print(joke)
# Multithreaded crawler: basic threading demo
import threading
class A(threading.Thread):
    """First demo thread: prints ``im A`` ten times when run."""

    def __init__(self):
        # No extra state; only the standard Thread initialisation is needed.
        super().__init__()

    def run(self):
        """Thread body; executed in the new thread once ``start()`` is called."""
        for _ in range(10):
            print('im A')
class B(threading.Thread):
    """Second demo thread: prints ``im B`` ten times when run."""

    def __init__(self):
        # No extra state; only the standard Thread initialisation is needed.
        super().__init__()

    def run(self):
        """Thread body; executed in the new thread once ``start()`` is called."""
        for _ in range(10):
            print('im B')
# Create both demo threads, then launch them in order (A first, then B).
t1, t2 = A(), B()
for _worker in (t1, t2):
    _worker.start()  # begins executing that thread's run() concurrently
# As the output shows, the two threads execute in an interleaved fashion.
# 7. Multithreaded crawler
# (Blog page footer, not part of the code) Latest recommended article published 2024-01-05 00:36:33