1.上次做了一个利用Instaloader下载数据的示例,我发现这方面的教程很少。由于单线程下载速度慢,我这里利用多线程来加速下载数据。注意worker的数量不要设置太多,不然会报以下错误,错误信息为:
JSON Query to explore/locations/245942146/: 429 Too Many Requests
2.如果想知道hashtags.txt里面是啥,请参考前面的博客:https://blog.csdn.net/w5688414/article/details/85061680
和之前一样,前提是租一台国外的虚拟主机(国内下载不了)。以下为多线程下载的代码,文件名为demo.py
from threading import Thread
from time import time, sleep
from queue import Queue
from datetime import datetime
import instaloader
# Single-threaded version of the download loop, kept for reference:
# for HASHTAG in hashtags:
#     try:
#         posts = L.get_hashtag_posts(HASHTAG)
#         count=0
#         print(HASHTAG)
#         for post in posts:
#             if(post.is_video):
#                 continue
#             if(count==1000):
#                 break
#             # print(post.date)
#             L.download_post(post, target='#'+HASHTAG)
#             count+=1
#     except Exception as e:
#         print(e)

# Module-level Instaloader instance used by download_tweets(), and therefore
# shared by every DownloadWorker thread.
# NOTE(review): nothing in this file establishes that Instaloader is safe to
# share across threads -- confirm before raising the worker count.
L = instaloader.Instaloader()
def download_tweets(HASHTAG, *, max_posts=1000, error_log="error.txt"):
    """Download up to ``max_posts`` non-video posts for one hashtag.

    Posts are fetched through the module-level Instaloader instance ``L`` and
    stored under the target ``'#' + HASHTAG``. Video posts are skipped and do
    not count towards the limit.

    Any ``Exception`` raised while fetching or downloading is printed and
    appended to ``error_log``; it is not re-raised, so one failing hashtag
    does not abort the caller.

    Args:
        HASHTAG: Hashtag name without the leading ``#``.
        max_posts: Maximum number of posts to download.
        error_log: Path of the text file that collects error messages.
    """
    try:
        posts = L.get_hashtag_posts(HASHTAG)
        print(HASHTAG)
        count = 0
        for post in posts:
            # Check the limit first so that, once it is reached, we stop
            # immediately instead of paging through trailing video posts.
            if count >= max_posts:
                break
            if post.is_video:
                continue
            L.download_post(post, target='#' + HASHTAG)
            count += 1
    except Exception as e:
        print(e)
        # Explicit UTF-8: the message may contain a non-ASCII hashtag, and an
        # encoding error raised here would escape this handler.
        with open(error_log, "a", encoding="utf-8") as fp:
            fp.write(str(e) + "\n")
class DownloadWorker(Thread):
    """Thread that pulls hashtags from a queue and downloads each one.

    Every item taken from the queue is acknowledged with ``task_done()``,
    including the ``None`` stop sentinel and items whose download raised, so
    ``queue.join()`` in the producer cannot hang on this worker.

    Args:
        queue: ``queue.Queue`` of hashtag strings; ``None`` stops the worker.
        sleep: Seconds to pause after each processed item.
    """

    def __init__(self, queue, sleep=1):
        super().__init__()
        self.queue = queue
        # Not used by this class; the (misspelled) attribute is kept so that
        # any external code reading it keeps working.
        self.numPicrures = 0
        self.sleep = sleep

    def run(self):
        """Process queue items until a ``None`` sentinel is received."""
        while True:
            item = self.queue.get()
            try:
                if item is None:
                    break
                download_tweets(item)
            except Exception as e:
                # Thread boundary: report and keep serving the queue rather
                # than letting one bad item kill the worker.
                print(e)
            finally:
                # Always acknowledge the item, even on the sentinel or error.
                self.queue.task_done()
            # Pause between items to stay below the server's rate limit.
            sleep(self.sleep)
def read_hashtags(path="hashtags.txt"):
    """Return the hashtags listed in ``path``, one per line.

    ``#`` characters and surrounding whitespace are removed. Blank entries
    are dropped and duplicates are removed while keeping first-seen order, so
    no empty or repeated hashtag is queued for download.
    """
    with open(path, encoding="utf-8") as f:
        names = (line.replace('#', '').strip() for line in f)
        return list(dict.fromkeys(name for name in names if name))


def main(num_workers=5, worker_sleep=2):
    """Download every hashtag in ``hashtags.txt`` using a pool of workers.

    Args:
        num_workers: Number of DownloadWorker threads. Keep it small: too
            many workers lead to "429 Too Many Requests" errors.
        worker_sleep: Seconds each worker pauses after an item.
    """
    hashtags = read_hashtags('hashtags.txt')
    ts = time()
    queue = Queue()
    for _ in range(num_workers):
        worker = DownloadWorker(queue, worker_sleep)
        # Setting daemon to True will let the main thread exit even though the
        # workers are blocking
        worker.daemon = True
        worker.start()
    for hashtag in hashtags:
        queue.put(hashtag)
    # Block until every queued hashtag has been acknowledged by a worker.
    queue.join()
    print('Took {}s'.format(time() - ts))


if __name__ == "__main__":
    main()