用 Python 爬取樱花动漫的图片等信息,作为毕业设计的数据。
学习视频:
代码
import sys
from urllib.parse import urljoin

# Scrape every anime linked from the site's home page and print one SQL
# INSERT statement per anime, ready to be pasted into the project database.

# Site whose home page lists the anime to export.
DOMAIN = "https://www.iyhdmw.com"
HOME_URL = DOMAIN + "/"
# Primary key given to the first generated row; later rows count up from it.
FIRST_ID = 1703792039038 + 1
# Seconds to wait on a request before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Links on the home page that lead to each anime's detail page.
DETAIL_LINKS_XPATH = "//div[@class='area']/div/div/ul/li/a/@href"
# Fields read from a detail page.
KEYWORDS_XPATH = "//head/meta[@name='keywords']/@content"
TAG_LINKS_XPATH = "//div[@class='sinfo']/span/a/@href"
IMAGE_XPATH = "//div[@class='area']/div[@class='fire l']/div/img/@src"
DESCRIPTION_XPATH = "//head/meta[@name='description']/@content"


def is_chinese(char):
    """Return True when ``char`` lies in the range U+4E00..U+9FFF."""
    return '\u4e00' <= char <= '\u9fff'


def sql_quote(value):
    """Return ``value`` as a single-quoted SQL string literal.

    Embedded single quotes are doubled so scraped text cannot terminate the
    literal early. Backslashes are doubled as well.
    NOTE(review): the backslash handling assumes a MySQL-style target where
    backslash is an escape character -- confirm against the real database.
    """
    text = str(value).replace("\\", "\\\\").replace("'", "''")
    return "'" + text + "'"


def tags_from_hrefs(hrefs):
    """Build the space-separated tag field from the tag links' hrefs.

    A tag is the last two characters of an href, kept only when the first
    of those two characters is Chinese. Every tag is followed by one space
    (so a non-empty result ends with a space). Hrefs shorter than two
    characters are ignored instead of raising ``IndexError``.
    """
    return "".join(
        href[-2:] + " "
        for href in hrefs
        if len(href) >= 2 and is_chinese(href[-2])
    )


def absolute_url(src):
    """Resolve an ``src``/``href`` value against the site's home page.

    Handles protocol-relative (``//host/a.jpg``), site-relative
    (``/a.jpg``) and already-absolute values; an empty value stays empty.
    Example: ``//fc.sinaimg.cn/large/x.jpg`` becomes
    ``https://fc.sinaimg.cn/large/x.jpg``.
    """
    if not src:
        return ""
    return urljoin(HOME_URL, src)


def first_or_empty(matches):
    """Return the first XPath match, or ``""`` when nothing matched."""
    return matches[0] if matches else ""


def build_insert(row_id, keywords, tags, image_url, description):
    """Return the ``insert into remendongman`` statement for one anime.

    Only the five arguments vary between rows; the remaining values are the
    fixed placeholders the statement has always used. Every value is passed
    through :func:`sql_quote`.
    """
    values = (
        row_id,
        "2021-01-05 11:42:41",
        keywords,
        "分类1",
        tags,
        image_url,
        "",
        "",
        "2023-12-27",
        description,
        "0",
        "0",
        "2023-12-28 17:59:21",
        "0",
    )
    quoted = ",".join(sql_quote(value) for value in values)
    return "insert into remendongman values (" + quoted + ");"


def main():
    """Fetch the home page, visit each detail page and print the INSERTs.

    Detail pages that cannot be fetched or that lack the keywords meta tag
    are reported on stderr and skipped, so stdout only carries the list of
    detail links followed by the SQL statements.

    Raises:
        requests.RequestException: if the home page itself cannot be fetched.
    """
    # Imported here so that importing this module (e.g. to reuse or test the
    # helpers above) needs neither the network nor these third-party packages.
    import requests
    from lxml import etree

    def fetch_tree(url):
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return etree.HTML(response.text)

    home = fetch_tree(HOME_URL)
    # etree.HTML may yield None for an empty document.
    detail_paths = home.xpath(DETAIL_LINKS_XPATH) if home is not None else []
    print(detail_paths)

    next_id = FIRST_ID
    for path in detail_paths:
        try:
            page = fetch_tree(urljoin(HOME_URL, path))
        except requests.RequestException as exc:
            print("skipping", path, "-", exc, file=sys.stderr)
            continue
        if page is None:
            print("skipping", path, "- empty page", file=sys.stderr)
            continue

        keywords = first_or_empty(page.xpath(KEYWORDS_XPATH))
        if not keywords:
            print("skipping", path, "- no keywords meta tag", file=sys.stderr)
            continue

        statement = build_insert(
            next_id,
            keywords,
            tags_from_hrefs(page.xpath(TAG_LINKS_XPATH)),
            absolute_url(first_or_empty(page.xpath(IMAGE_XPATH))),
            first_or_empty(page.xpath(DESCRIPTION_XPATH)),
        )
        print(statement)
        next_id += 1


if __name__ == "__main__":
    main()