#coding=utf-8
import os
import random
import urllib.request as urllib

import requests
from bs4 import BeautifulSoup
from lxml import etree
# Pool of desktop-browser User-Agent strings; one is picked at random per
# request so the crawler looks less like a bot.
# NOTE: in the original source most entries were accidentally split into TWO
# list elements by a stray comma, so random.choice() often returned an invalid
# fragment such as "(KHTML, like Gecko) Chrome/...".  Each entry below is one
# complete UA string (adjacent literals are implicitly concatenated).
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
class Crawler(object):
    """Downloads every post image of a Baidu Tieba thread to a local folder."""

    # Class-wide counter used to number downloaded files (1.jpg, 2.jpg, ...),
    # so numbering continues across pages of the thread.
    index = 1

    def __init__(self, start_url, save_dir=r'D:\pic'):
        """
        :param start_url: URL of the thread, e.g. "https://tieba.baidu.com/p/3881236527"
        :param save_dir: directory images are written to; created on demand.
                         Defaults to the historical hard-coded ``D:\\pic`` for
                         backward compatibility.
        """
        self.start_url = start_url
        self.save_dir = save_dir

    @staticmethod
    def request(url, **kwargs):
        """GET ``url`` and return the response text, or '' on request failure.

        Keeps the original best-effort semantics (caller sees an empty page on
        error) but no longer swallows unrelated exceptions with a bare except.
        """
        try:
            page = requests.get(url, **kwargs)
            return page.text
        except requests.RequestException:
            return ''

    @property
    def get_max_page(self):
        """Number of pages in the thread, read from the reply-count element.

        Falls back to 1 when the element is missing (fetch failure or page
        layout change) instead of raising; the original unpacked blindly and
        crashed with ValueError in that case.
        """
        html = etree.HTML(self.request(self.start_url))
        if html is None:  # empty/unparseable response
            return 1
        pages = html.xpath('//li[@class = "l_reply_num"]//@max-page')
        if not pages:
            return 1
        return int(pages[0])

    def get_all_urls(self, max_page):
        """Yield the URL of every page of the thread (?pn=1 .. ?pn=max_page)."""
        for page in range(1, max_page + 1):
            yield self.start_url + '?pn={}'.format(page)

    @classmethod
    def get_imgs(cls, html, save_dir=r'D:\pic'):
        """Download every post image (class ``BDE_Image``) found in ``html``.

        :param html: page source of one thread page
        :param save_dir: target directory, created if missing (the original
                         crashed when ``D:\\pic`` did not exist)
        """
        os.makedirs(save_dir, exist_ok=True)
        soup = BeautifulSoup(html, 'html.parser')
        for img in soup.find_all('img', class_='BDE_Image'):
            print("正在下载第{}张图片".format(cls.index))
            urllib.urlretrieve(img.get("src"),
                               os.path.join(save_dir, '{}.jpg'.format(cls.index)))
            cls.index += 1

    def run(self):
        """Crawl every page of the thread and download its images."""
        for url in self.get_all_urls(self.get_max_page):
            # Pick a random browser User-Agent per request (伪装一下 — disguise).
            headers = {'User-Agent': random.choice(user_agent_list)}
            html = self.request(url, headers=headers)
            self.get_imgs(html, self.save_dir)
if __name__ == '__main__':
    # Script entry point: crawl one Tieba thread (swap the URL as needed — 可以换).
    thread_url = "https://tieba.baidu.com/p/3881236527"
    Crawler(thread_url).run()
# 爬取百度贴吧某帖子的所有照片 — crawls all photos from one Baidu Tieba thread.
# (The two trailing lines were pasted blog text, not Python — kept here as
#  comments so the file parses; original publish date: 2020-04-17 16:40:18.)