1.抓取教务处主界面,存于一个txt文档中.
import requests
file_path = r"E:\教务处.txt"
try:
    # Spoof a browser User-Agent so the site doesn't reject a scripted request.
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get("http://jwch.sdut.edu.cn/", headers=kv)
    r.raise_for_status()
    # Use the content-sniffed encoding so Chinese text decodes correctly.
    r.encoding = r.apparent_encoding
    # Bug fix: write with an explicit encoding — the platform default (GBK on
    # Chinese Windows) can raise UnicodeEncodeError for some page characters.
    with open(file_path, 'w', encoding='utf-8') as file_obj:
        file_obj.write(r.text)
except Exception:
    # Narrowed from a bare `except:`, which would also swallow KeyboardInterrupt.
    print("爬取失败")
2.百度搜索关键字。
import requests
try:
    # Baidu keyword endpoint: http://www.baidu.com/s?wd=keyword — `params`
    # URL-encodes the query for us.
    kv = {'wd': 'Python'}
    r = requests.get("http://www.baidu.com/s", params=kv)
    # Show the final URL requests actually built and sent.
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except Exception:
    # Narrowed from a bare `except:`, which would also swallow KeyboardInterrupt.
    print("爬取失败")
3.抓取图片。
import requests
import os
url = "http://img1001.pocoimg.cn/image/poco/works/36/2018/0307/21/15204284272111499_46378737_H1920.jpg"
root = 'E://pics//'
# Name the local file after the last path segment of the URL.
image_path = root + url.split('/')[-1]
try:
    # Create the target directory before downloading anything.
    if not os.path.exists(root):
        os.makedirs(root)
    if not os.path.exists(image_path):
        # Bug fix: the original fetched the image twice (once unconditionally,
        # once inside this branch); a single request is enough, and
        # raise_for_status now guards the request that is actually saved.
        r = requests.get(url)
        r.raise_for_status()
        # 'wb' — image bytes must be written without text decoding.
        with open(image_path, 'wb') as file_obj:
            file_obj.write(r.content)
    print('图片保留成功')
except Exception:
    # Narrowed from a bare `except:`, which would also swallow KeyboardInterrupt.
    print("爬取失败")
4. ip138 爬取
import requests
url = "http://m.ip138.com/ip.asp?ip="  # ip138 IP-lookup endpoint
ip = '202.204.80.112'
try:
    r = requests.get(url + ip)
    r.raise_for_status()
    # Sniffed encoding so the Chinese result page decodes correctly.
    r.encoding = r.apparent_encoding
    # Only the tail of the page contains the lookup result.
    print(r.text[-500:])
    print("爬取成功!")
except Exception:
    # Narrowed from a bare `except:`, which would also swallow KeyboardInterrupt.
    print("爬取失败!")
5.抓取中国大学排名
from bs4 import BeautifulSoup
import requests
import bs4
kv = {"user-agent": "Mozilla/5.0"}


def getHTMLText(url):
    """Fetch *url* with a browser User-Agent and return its decoded text.

    Returns "" (after printing a diagnostic) on any failure, so callers can
    simply test for an empty string.
    """
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        # Sniffed encoding so Chinese pages decode correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow KeyboardInterrupt.
        print("getHTMLText fail")
        return ""
def fillUnivList(ulist, html):
    """Parse the ranking page *html*; append [rank, name, score] rows to *ulist*.

    Non-Tag children of <tbody> (whitespace NavigableStrings) are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find('tbody').children:
        # Guard clause: only real <tr> tags carry table cells.
        if not isinstance(row, bs4.element.Tag):
            continue
        cells = row('td')
        ulist.append([cells[0].string, cells[1].string, cells[3].string])
def printUnivList(ulist, num):
    """Print a header line plus the first *num* rows of *ulist*.

    chr(12288) is the full-width CJK space, used as the fill character so
    Chinese text columns line up.
    """
    prmod = "{0:^10}\t {1:{3}^10}\t {2:{3}^10}\t"
    print(prmod.format("排名", "学校", "总分", chr(12288)))
    i = 0
    while i < num:
        row = ulist[i]
        print(prmod.format(row[0], row[1], row[2], chr(12288)))
        i += 1
def main():
    """Download the 2016 best-universities ranking page and print the top 20."""
    uinfo = []
    page = getHTMLText("http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html")
    fillUnivList(uinfo, page)
    printUnivList(uinfo, 20)


main()
6. 淘宝抓取商品信息
import json
import re

import requests
def getHtml(url):
    """Fetch *url* and return its decoded text, or "" on any failure.

    Bug fix: the original header dict was {"ueser-agent": "Mozalli/5.0"} — the
    key was misspelled, so no User-Agent was actually overridden, and the value
    misspelled the browser token. Both are corrected here.
    """
    try:
        kv = {"user-agent": "Mozilla/5.0"}
        r = requests.get(url, timeout=30, headers=kv)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow KeyboardInterrupt.
        print("getHtml faild.")
        return ""
def parserHtml(html, good_list):
    """Extract [title, price] pairs from a Taobao search page into *good_list*.

    The page embeds JSON-like fields "raw_title":"..." and "view_price":"...".
    Security/bug fix: the original ran eval() on scraped text (arbitrary code
    execution on untrusted input) and split on every ':' (broken for titles
    containing ':'). We split on the first ':' only and decode the quoted
    value with json.loads instead.
    """
    name_pat = re.compile(r'"raw_title":".*?"')
    price_pat = re.compile(r'"view_price":"[\d.]*"')
    names = name_pat.findall(html)
    prices = price_pat.findall(html)
    for raw_name, raw_price in zip(names, prices):
        # After the first ':' the remainder is a double-quoted JSON string.
        name = json.loads(raw_name.split(":", 1)[1])
        price = json.loads(raw_price.split(":", 1)[1])
        good_list.append([name, price])
def display(good_list):
    """Print one numbered line per [name, price] entry: index, price, name.

    chr(12288) is the full-width CJK space, used as the fill character so
    Chinese columns align. Idiom fix: enumerate replaces the manual `cnt`
    counter plus range(len(...)) indexing.
    """
    print_mode = "{0:{3}<4}\t{1:{3}<16}\t {2:{3}<8}\t"
    for cnt, (name, price) in enumerate(good_list, 1):
        print(print_mode.format(cnt, price, name, chr(12288)))
def main():
    """Interactive Taobao search: read item name and page count from stdin,
    fetch that many result pages, and print all goods sorted by price."""
    name = input("输入货物名:")
    raw_url = "https://s.taobao.com/search?q=" + name
    base = 44  # Taobao shows 44 items per result page; 's' is the item offset.
    num = int(input("输入查询深度:"))
    good_list = []
    print_mode = "{0:{3}<4}\t{1:{3}<16}\t {2:{3}<8}\t"
    print(print_mode.format("序号", "价格", "商品名", chr(12288)))
    for page in range(num):
        try:
            # Bug fix: the offset must advance by the page size (`base`), not
            # by `num` — the original defined base = 44 but never used it.
            html = getHtml(raw_url + '&s=' + str(base * page))
            parserHtml(html, good_list)
        except Exception:
            # Best-effort: a failed page is skipped, not fatal.
            continue
    # Prices are strings like "12.50"; sort numerically.
    good_list.sort(key=lambda item: float(item[1]))
    display(good_list)


main()
7.爬取股票信息
import re
import requests
from bs4 import BeautifulSoup
urllist = "http://quote.eastmoney.com/stocklist.html"
urlbaidu = "https://gupiao.baidu.com/stock/"
def getHtml(url):
    """Fetch *url* with a browser User-Agent; return decoded text or '' on failure."""
    kv = {"user-agent": "Mozilla/5.0"}
    try:
        r = requests.get(url, headers=kv)
        r.raise_for_status()
        # Sniffed encoding so Chinese pages decode correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        # Narrowed from a bare `except:`; callers treat '' as "fetch failed".
        return ''
def getStockList():
    """Scrape the EastMoney listing page and return stock ids like 'sh600000'.

    Bug fix: the original assigned the 'quotebody' find_all result to tagA and
    then immediately overwrote it with the <a> anchors — the dead statement is
    removed.
    """
    html = getHtml(urllist)
    soup = BeautifulSoup(html, "html.parser")
    quote_div = soup.find('div', attrs={'class': 'qox'})
    anchors = quote_div.find_all('a')
    # Ids are 'sh' or 'sz' followed by six digits, embedded in the href.
    pattern = re.compile(r'[s][hz]\d{6}')
    stockid = []
    for a in anchors:
        try:
            sid = pattern.findall(a.attrs['href'])[0]
            stockid.append(sid)
        except (KeyError, IndexError):
            # Anchor without an href, or href without a stock id — skip it.
            continue
    return stockid
def getinfoDict():
    """Scrape each stock's Baidu Gupiao page into a dict; return the list.

    Each dict maps "股票名称" plus the page's <dt> labels to their <dd> values.
    Pages that fail to download or parse are skipped (best-effort crawl).
    """
    stockid = getStockList()
    stockList = []
    for sid in stockid:  # renamed from `id`, which shadowed the builtin
        try:
            infoDict = {}
            url = urlbaidu + sid + '.html'
            html = getHtml(url)
            if html == '':
                continue
            soup = BeautifulSoup(html, 'html.parser')
            tables = soup.find('div', attrs={'class': 'stock-bets'})
            # First whitespace-separated token of the bets-name text is the name.
            name = tables.find(attrs={'class': 'bets-name'}).text.split()[0]
            infoDict.update({"股票名称": name})
            print("股票名称:%s %s" % (infoDict["股票名称"], sid))
            div = tables.find('div', attrs={'class': 'bets-content'})
            dts = div.find_all('dt')
            dds = div.find_all('dd')
            # <dt> labels pair positionally with <dd> values.
            for dt, dd in zip(dts, dds):
                print(dt.string + ':' + dd.string)
                infoDict[dt.string] = dd.string
            stockList.append(infoDict)
        except Exception:
            # Narrowed from a bare `except:`; malformed pages are skipped.
            continue
    return stockList


getinfoDict()