# Novel scraper (爬取小说)
# Scrape the xbiquge.la catalogue: list every book, then download each
# book's chapters and append them to a per-book .txt file.
import requests
from lxml import etree

INDEX_URL = 'https://www.xbiquge.la/xiaoshuodaquan/'
SITE_ROOT = 'https://www.xbiquge.la'

res = requests.get(url=INDEX_URL)
# Decode explicitly as UTF-8: the site serves UTF-8 pages but requests may
# guess a different encoding for res.text and mojibake the titles.
ele = etree.HTML(res.content.decode('utf-8'))
book_names = ele.xpath("//div[@class='novellist']/ul/li/a/text()")
book_urls = ele.xpath("//div[@class='novellist']/ul/li/a/@href")

for book_index, book_url in enumerate(book_urls):
    res = requests.get(url=book_url)
    ele = etree.HTML(res.content.decode('utf-8'))
    chapter_names = ele.xpath("//div[@id='list']/dl/dd/a/text()")
    # Chapter hrefs are site-relative; they must be joined with SITE_ROOT.
    chapter_urls = ele.xpath("//div[@id='list']/dl/dd/a/@href")
    print(chapter_urls)
    for chapter_index, chapter_url in enumerate(chapter_urls):
        chapter_url = SITE_ROOT + chapter_url
        print('网址是:' + chapter_url)
        res = requests.get(url=chapter_url)
        ele = etree.HTML(res.content.decode('utf-8'))
        book_content = ele.xpath("//div[@id='content']/text()")
        # Chapter title first, then one text fragment per line.
        s = chapter_names[chapter_index] + '\n'
        for fragment in book_content:
            s += fragment
            s += '\n'
        print('正在爬虫《' + book_names[book_index] + "》的" + chapter_names[chapter_index])
        print(s)
        # Append mode so successive chapters accumulate in one file per book.
        with open(book_names[book_index] + '.txt', 'a+', encoding='utf-8') as w:
            w.write(s)
    # NOTE(review): the paste lost its indentation; this break most plausibly
    # sat at book-loop level to limit the crawl to the first book while
    # testing -- confirm, and remove it to scrape the whole catalogue.
    break
# Sogou image scraper found online (网上的爬取sogou图片)
import requests
import json
import urllib
def getSogouImag(category, length, path):
    """Download up to `length` recommended Sogou images of `category` into `path`.

    Files are written as 0.jpg, 1.jpg, ... under `path`, which is created
    if it does not already exist. Image URLs come from the site's
    getAllRecomPicByTag JSON endpoint.
    """
    # `import urllib` alone does NOT bring urllib.request into scope, so the
    # original urlretrieve call raised AttributeError; import the submodule.
    import os
    import urllib.request

    imgs = requests.get('http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category='
                        + category + '&tag=%E5%85%A8%E9%83%A8&start=0&len=' + str(length))
    # 'all_items' holds one record per picture; 'bthumbUrl' is its big thumbnail.
    items = json.loads(imgs.text)['all_items']
    imgs_url = [item['bthumbUrl'] for item in items]

    os.makedirs(path, exist_ok=True)  # urlretrieve fails if the directory is missing
    for m, img_url in enumerate(imgs_url):
        print('***** ' + str(m) + '.jpg *****' + ' Downloading...')
        urllib.request.urlretrieve(img_url, path + str(m) + '.jpg')  # save url to local file
    print('Download complete!')


getSogouImag('壁纸', 2000, 'd:/download/壁纸/')
# Unfinished Sogou image scraper (未完成的爬取sogou图片)
import requests
import json
import urllib
from lxml import etree
def getSogouImag(category, length, path):
    """UNFINISHED Sogou wallpaper scraper (HTML-page rework of the JSON version).

    Builds 100 candidate detail-page URLs, fetches only the first one (early
    ``break``), and prints the image links found on it. ``category``,
    ``length`` and ``path`` are accepted for interface compatibility but are
    not used yet; actual downloading is not implemented.
    """
    # Candidate detail pages did=1..100 (the original incremented i before use).
    page_urls = []
    for i in range(1, 101):
        page_urls.append('https://pic.sogou.com/d?query=%E5%A3%81%E7%BA%B8&forbidqc=&entityid=&preQuery=&rawQuery=&queryList=&st=&mode=13&cwidth=1920&cheight=1080&dm=4&did=' + str(i))

    imgs_url = []
    for page_url in page_urls:
        res = requests.get(page_url)
        ele = etree.HTML(res.text)
        # Result pages link images via <div class="img-box"><a href=...>.
        hrefs = ele.xpath("//div[@class='img-box']/a/@href")
        print(hrefs)
        imgs_url.append(hrefs)
        # TODO: remove this break and download each href once the xpath
        # is confirmed against a live page.
        break

    print(imgs_url)
    print('Download complete!')


getSogouImag('壁纸', 2000, 'd:/download/壁纸/')