# 一个简单的表情包爬取程序
# 图源网站:斗图吧(doutula.com)
# 工具:requests 库、lxml
# 用正则表达式清理文件名中的标点
import requests
from lxml import etree
from urllib import request
import os
import re
def main():
    """Visit listing pages 1 through 100 and pass each page URL to parse_page."""
    page_urls = (
        "http://www.doutula.com/photo/list/?page=%d" % page_no
        for page_no in range(1, 101)
    )
    # The generator is lazy, so each URL is built right before it is scraped.
    for page_url in page_urls:
        parse_page(page_url)
def _build_filename(alt, img_url):
    """Return a file name for an image built from its alt text and URL.

    Args:
        alt: The image's ``alt`` text, or ``None`` when the attribute is absent.
        img_url: The image address; its extension becomes the file extension.

    Returns:
        The cleaned alt text followed by the URL's extension. When the cleaned
        alt text is empty, the URL's base name is used as the stem instead.
    """
    # Strip ASCII and full-width punctuation plus characters that are path
    # separators or reserved in file names, so the name cannot escape the
    # output directory or be rejected by the file system.
    stem = re.sub(r'[\\/:*"<>|\??.,,。!!]', '', alt or '')
    # Extensions may carry a "!..." tail (e.g. ".jpg!dta"); keep only the part
    # before the first "!".
    extension = os.path.splitext(img_url)[1].split('!')[0]
    if not stem:
        # Without a usable alt text every image would be saved as just the
        # extension and overwrite the previous one.
        stem = os.path.splitext(os.path.basename(img_url))[0]
    return stem + extension


def parse_page(url):
    """Fetch one listing page and download the images it references.

    Every ``<img>`` element under ``<body>`` that has a ``data-original``
    attribute is saved into the ``image`` directory (created if missing),
    named after its ``alt`` text. Elements without ``data-original`` are
    skipped.

    Args:
        url: Address of the page to scrape.
    """
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5478.400 QQBrowser/10.1.1550.400"
    }
    response = requests.get(url, headers=headers)
    document = etree.HTML(response.text)

    output_dir = "image"
    # urlretrieve does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    for img in document.xpath("//body//img"):
        img_url = img.get("data-original")
        if not img_url:
            # The XPath matches every image on the page; those without a
            # data-original attribute have nothing to download.
            continue
        filename = _build_filename(img.get("alt"), img_url)
        request.urlretrieve(img_url, os.path.join(output_dir, filename))
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()