爬取虎扑社区-晒晒照片
-
网上看到这个消息,顺便想试试手就做了这个
-
环境是MacOS + Anaconda (python 3.7)
-
就是练练手,网不好的时候会有bug 提示:属性错误 AttributeError(页面没取回来时解析结果为 None,再调用 .xpath 就会报这个错)
-
实现功能:
- 图片下载
- 翻页是直接修改url
就是简单的爬虫,但是有个很大的bug没有处理:虎扑社区的图片应该是ajax加载的,所以requests直接扒html源码下来的时候有些图片没有加载出来,那样的话下载下来的图片是默认的加载图片咯
有大佬会的话还请告诉我嗷~诶嘿
import requests
import os
import re
from bs4 import BeautifulSoup
import urllib
import time
from lxml import etree
# Crawler configuration.
baseUrl = "https://bbs.hupu.com"            # forum root, prefixed to relative thread links
startUrl = "https://bbs.hupu.com/selfie-"   # listing pages are selfie-1, selfie-2, ...
pageNum = 1                                 # NOTE(review): appears unused in this file — confirm before removing

# Browser-like User-Agent so the forum serves the normal desktop HTML.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
# Example thread-list xpath kept for reference:
# //*[@id="ajaxtable"]/div[1]/ul/li[2]

# Bug fix: requests matches proxy keys case-sensitively against the lowercase
# URL scheme, so the original 'HTTP'/'HTTPS' keys were silently ignored and no
# proxy was ever used. Keys must be lowercase 'http'/'https'.
proxies = {
    'http': '112.85.169.12:9999',
    'https': '27.43.190.162:9999',
}
def getURLXPath(url):
    """Fetch *url* and return the parsed lxml HTML tree.

    Sleeps 1s before each request as crude rate limiting.  Returns the
    element tree from ``etree.HTML``; note this can be ``None`` when the
    response body is empty — callers should be prepared for that (the
    AttributeError mentioned in the header notes comes from calling
    ``.xpath`` on that None).
    """
    time.sleep(1)  # be polite: throttle to one request per second
    # timeout added so a dead host / bad proxy cannot hang the crawl forever
    response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    # errors='replace' so a stray non-UTF-8 byte cannot crash the whole run
    html = response.content.decode('utf-8', errors='replace')
    return etree.HTML(html)
def createFloder(title):
    """Create directory *title* (and any missing parents) if absent.

    Bug fix: the original ``exists()`` + ``os.mkdir`` pair raced (TOCTOU)
    and crashed when the path contained a missing parent — thread titles
    may contain '/' which turns them into nested paths.  ``os.makedirs``
    with ``exist_ok=True`` handles both cases atomically.
    """
    os.makedirs(title, exist_ok=True)
def savePhoto(root, url):
    """Download image *url* into directory *root*; skip placeholders and duplicates.

    root: existing directory the file is written into.
    url:  direct image URL. URLs containing ".png" are skipped — presumably
          these are the forum's not-yet-loaded placeholder images (TODO confirm).
    Returns None in all cases.
    """
    if ".png" in url:
        return
    # Bug fix: the original used url[-10:] as the file name, which can contain
    # '/' and produce an invalid path; use the last path segment instead.
    filename = url.rsplit('/', 1)[-1]
    path = os.path.join(root, filename)
    if os.path.exists(path):
        return  # already downloaded — don't refetch
    # timeout so one dead image host cannot stall the whole crawl;
    # 'with' closes the file even on a write error (flush/close were redundant)
    response = requests.get(url, timeout=10)
    with open(path, 'wb') as f:
        f.write(response.content)
def getPageAllUrl(url):
    """Walk one board-listing page and download images from every thread on it.

    url: absolute URL of one listing page (e.g. selfie-3).
    Side effects: prints "<title> <thread_url>" per thread and calls
    getPageAllImg for each one.
    """
    xpath_data = getURLXPath(url)
    for link in xpath_data.xpath('//a[@class="truetit"]'):
        title = link.xpath('string(.)').strip()
        hrefs = link.xpath('@href')
        if not hrefs:
            continue  # bug fix: anchors without href crashed with IndexError
        thread_url = baseUrl + hrefs[0].strip()
        print(f"{title} {thread_url}")
        getPageAllImg(title, thread_url)
def getPageAllImg(title, url):
    """Download all .jpg images from the first post of a thread.

    title: thread title, used as the local download folder name.
    url:   absolute thread URL.
    """
    xpath_data = getURLXPath(url)
    imgs = xpath_data.xpath('//*[@id="tpc"]/div/div[2]/table[1]/tbody/tr/td/div[2]/p/img/@src')
    folder = "./" + title
    createFloder(folder)
    for src in imgs:
        src = str(src)
        # Bug fix: the original did split(".jpg")[0] + ".jpg" unconditionally,
        # which turned any non-jpg src into a broken URL; skip those instead.
        if ".jpg" not in src:
            continue
        # Strip query-string / size suffixes that follow the ".jpg" extension.
        img_url = src.split(".jpg")[0] + ".jpg"
        savePhoto(folder, img_url)
        print(img_url)
def changePage(pages=15):
    """Crawl listing pages 1..pages of the selfie board.

    pages: number of listing pages to visit (default 15, matching the
    original hard-coded value — now a parameter for reuse).
    """
    for page in range(1, pages + 1):
        url = startUrl + str(page)
        print(url)
        getPageAllUrl(url)
# Entry point: crawl every configured listing page of the board.
if __name__ == '__main__':
    changePage()