Futures
import requests
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(filename)s[line:%(lineno)d]-'
'%(levelname)s: %(message)s')
def gethtml(url, pageNumber=1):
    post_data = {
        'flag': 'qh',
        'prod_type': 'lntx',
        'pageNumber': pageNumber,
        'pageSize': '15'
    }
reup = requests.post(url, data=post_data)
return reup
def parsehtml():
size = 1
count = 0
url = 'http://price.mofcom.gov.cn/datamofcom/front/price/pricequotation/codeDetailQuery'
while True:
reup = gethtml(url, size)
logging.info(reup.text)
if count == 0:
count = reup.json().get("maxPageNum")
rox_list = reup.json().get("rows")
for rox in rox_list:
            with open('D://期货.csv', 'a', encoding='utf-8') as f:
                f.write(rox.get("prod_name") + ',')  # comma-separate the CSV fields
                f.write(rox.get("prod_spec") + ',')
num = rox.get("seqno")
seqno_data = {
'seqno': num,
'startTime': '',
'endTime': '',
'pageNumber': '1',
'pageSize': '10000'
}
                reup_seqno = requests.post(
                    'http://price.mofcom.gov.cn/datamofcom/front/price/pricequotation/priceQueryList', data=seqno_data)
                try:
                    row0 = reup_seqno.json().get('rows')[0]  # parse the JSON once, reuse below
                    money = row0.get('price') + row0.get('unit')
                except (IndexError, TypeError):
                    logging.error('no data')
                    f.write('\n')
                    continue
f.write(money)
f.write('\n')
        logging.info('scrape succeeded')
if count > size:
size = size + 1
else:
break
if __name__ == '__main__':
    parsehtml()
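The futures script above assembles each CSV line by hand, so a field that itself contains a comma would break the columns. A minimal sketch of the same row-writing step with the standard csv module, which quotes such fields automatically (the write_rows name and its default path are illustrative, not part of the original script):

import csv

def write_rows(rows, path='D://期货.csv'):
    # newline='' stops the csv module from inserting blank lines on Windows
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for row in rows:
            # the same two fields the script pulls from each JSON row
            writer.writerow([row.get('prod_name'), row.get('prod_spec')])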
Weather
# http://www.envicloud.cn/dataMap?title=3
import requests
url = 'http://www.envicloud.cn/getAreaInfoByZoom.action?zoom=12'
def gethtml(url):  # fetch the page
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
"Referer": "http://price.mofcom.gov.cn/pricequotation/morepricequotation.shtml?flag=qh&prod_type=lntx"}
resp = requests.post(url, headers=header)
return resp
def getdetail(html):
    data = html.json()  # parse the response body as JSON
    cities = data.get('CITYINFO')
    for i in cities:
        print(i)
        lng = i.get('longitude')  # longitude
        lat = i.get('latitude')   # latitude
        resp = gethtml(f"http://www.envicloud.cn/getPointDetail.action?lng={lng}&lat={lat}")  # fetch the point detail for these coordinates
        cityjson = resp.json()
        citycode = cityjson.get('citycode')
        with open('D://天气.csv', 'a', encoding='utf-8') as f:  # append to D://天气.csv
            for city_0 in cityjson.values():
                f.write(str(city_0) + ',')  # str() guards against non-string JSON values
            cityhtml = gethtml(
                f'http://www.envicloud.cn/getWeatherForecast.action?citycode={citycode}').json()  # fetch the forecast by city code
            for cityshuj in cityhtml.values():
                f.write(str(cityshuj) + ',')
            f.write('\n')
if __name__ == '__main__':
getdetail(gethtml(url))
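The weather script fires three requests per city back to back. A minimal sketch of a politer fetch, assuming the same getPointDetail endpoint; fetch_point is a hypothetical helper, and requests.Session reuses one connection across calls:

import time
import requests

session = requests.Session()  # one TCP connection reused across requests
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def fetch_point(lng, lat, delay=1.0):
    resp = session.post(
        f'http://www.envicloud.cn/getPointDetail.action?lng={lng}&lat={lat}')
    time.sleep(delay)  # pause between cities so the server is not hammered
    return resp.json()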
Book List
import requests
from bs4 import BeautifulSoup
import os
def gethtml():
for num in range(1,35):
url = f'http://www.bookschina.com/24hour/1_0_{num}/'
reup = requests.get(url)
yield reup.text
def parsehtml(html):
    bsop = BeautifulSoup(html, 'lxml')
    book_list = bsop.find('div', class_="bookList")
    li_list = book_list.find_all('li')
    for li in li_list:
        row = []
        book = li.find('div', class_="infor")
        row.append(book.find('h2').find('a').get('title'))
        row.append(book.find('div', class_="author").find('a').text)
        row.append(book.find('div', class_="priceWrap").find('span', class_="sellPrice").text)
        row.append(book.find('div', class_="priceWrap").find('span', class_="discount").text)
        tuphtml = li.find('div', class_="cover").find('img').get('data-original')
        try:
            tupa = requests.get(f'http:{tuphtml}')
            tuname = tuphtml.split('/')[-1]
            with open('D://书籍列表.csv', 'a', encoding='utf-8') as f:
                f.write(','.join(row))  # comma-separated so the CSV has real columns
                f.write('\n')
            os.makedirs('D://bookimgs', exist_ok=True)  # create the folder once, no if/else needed
            with open(f'D://bookimgs/{tuname}', 'wb') as f:
                f.write(tupa.content)
        except (requests.RequestException, AttributeError):
            print('no image')
if __name__ == '__main__':
html_list = gethtml()
for html in html_list:
parsehtml(html)
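The cover downloads above assume every request succeeds and would happily save an error page as an image. A minimal sketch with a timeout and an explicit status check; save_cover is a hypothetical helper name:

import os
import requests

def save_cover(img_url, folder='D://bookimgs'):
    resp = requests.get(img_url, timeout=10)  # don't hang forever on a dead host
    resp.raise_for_status()                   # surface 404s instead of saving the error body
    os.makedirs(folder, exist_ok=True)
    name = img_url.split('/')[-1]
    with open(os.path.join(folder, name), 'wb') as f:
        f.write(resp.content)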
Book Reviews
import requests
from bs4 import BeautifulSoup
import logging
import re
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(filename)s[line:%(lineno)d]-'
'%(levelname)s: %(message)s')
def gethtml(bookdata):
for len_data in bookdata:
url = 'http://www.bookschina.com/ashx/GetMsg.ashx'
html = requests.post(url, data=len_data)
parsehtml(html.json().get('Html'))
def nextpage():
    numhtml = requests.get('http://www.bookschina.com/8229303.htm#tabookReco')
    skip_text = BeautifulSoup(numhtml.text, 'lxml').find('div', class_="p-skip").text
    num = re.search(r'\d+', skip_text).group()  # pull the full page count out of the "/207"-style text
    for page in range(int(num)):
        bookdata = {
            '_page': page + 1,
            '_bookid': '8229303',
            '_totalPage': '207',
        }
        yield bookdata
def parsehtml(html):
    bsup = BeautifulSoup(html, 'lxml')
    item = {}
    all_li = bsup.find_all('li')
    for li in all_li:
        item['name'] = li.find('a', target="_blank").text
        item['comment'] = li.find('p').text
        item['grade'] = len(li.find_all('i', class_="one"))  # one star icon per grade point
        CommentOnTheTitle = li.find('div', class_="theme").text.split(':')
        if CommentOnTheTitle[1] == '':
            CommentOnTheTitle[1] = '无'
        item['CommentOnTheTitle'] = CommentOnTheTitle[1]
        item['time'] = li.find('span', class_="time").text
        with open('D://书籍评论.csv', 'a', encoding='utf-8') as f:
            f.write(','.join(str(v) for v in item.values()))  # one comma-separated row per review
            f.write('\n')
logging.info('over')
if __name__ == '__main__':
gethtml(nextpage())
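parsehtml above joins the dict values by hand, so a comment containing a comma would spill across columns. A minimal sketch of the same write with csv.DictWriter, which quotes such fields; the FIELDS list mirrors the keys the function sets, and write_item is a hypothetical helper:

import csv

FIELDS = ['name', 'comment', 'grade', 'CommentOnTheTitle', 'time']

def write_item(item, path='D://书籍评论.csv'):
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS)
        writer.writerow(item)  # one properly quoted row per review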
News
import requests
import re
import os
def gethtml():
    for i in range(3):
        if i == 0:
            url = 'https://news.163.com/special/cm_guonei/?callback=data_callback'
        else:
            url = f'https://news.163.com/special/cm_guonei_0{i+1}/?callback=data_callback'
        reqs = requests.get(url)
        parsehtml(reqs.text)
def parsehtml(html):
    items = re.findall('{.*?"t.*?(.*?)add3.*?}', html, re.S)  # one JS object per news entry
    for data in items:
        title = re.findall('itle.*?:"(.*?)",', data)
        time = re.findall('time.*?:"(.*?)",', data)
        keywords = re.findall('keyname":"(.*?)"}', data)
        replies = re.findall('tienum":(.*?),', data)
        img = re.findall('imgurl.*?:"(.*?)"', data)
        print(title)
        print(keywords)
        with open('D://新闻.csv', 'a', encoding='utf-8') as f:
            f.writelines(str([title, time, keywords, replies]))
            f.write('\n')
        try:
            os.makedirs('D://news', exist_ok=True)  # create the image folder once
            with open('D://news/' + img[0].split('/')[-1], 'wb') as f:
                f.write(requests.get(url=img[0]).content)
        except (IndexError, requests.RequestException):
            pass  # no image for this entry
if __name__ == '__main__':
gethtml()
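The news pages return JSONP: the payload is wrapped in a data_callback(...) call, which is why the script falls back to regexes. If the wrapped body happens to be valid JSON (an assumption worth checking against the live response), a sketch like this parses it directly; fetch_items is a hypothetical helper:

import json
import requests

def fetch_items(url):
    text = requests.get(url).text.strip()
    # strip the 'data_callback(' prefix and the trailing ')' of the JSONP wrapper
    body = text[text.index('(') + 1:text.rindex(')')]
    return json.loads(body)  # a list of dicts with the title/time/keyname fields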
News List
'''2. Import urllib and the other required libraries correctly
3. Work out the correct request headers and page structure by inspection
4. Define a gethtml() function that fetches the page with the corresponding urllib methods
5. Create a parsing function parsehtml(html) that uses Beautiful Soup to extract the job title, location, job type, experience, education, and benefits
6. Save the extracted job title, location, job type, experience, education, and benefits (benefit entries separated by #) to D://新闻列表.csv'''
import requests
from bs4 import BeautifulSoup
def gethtml():
url = 'http://www.pjob.net/china.htm'
reup = requests.get(url)
return reup.text
def parsehtml(html):
    beup = BeautifulSoup(html, 'lxml')
    beuphtml = beup.find('ul', class_="hot_job_list")
    posts = beuphtml.find_all('a', class_="hot_post")
    for item in posts:
        with open('D://新闻列表.csv', 'a', encoding='utf-8') as f:
            url = item.get('href')
            print(url)
            name = item.get('title')
            print(name)
            namehtml = requests.get(url)
            namebsup = BeautifulSoup(namehtml.text, 'lxml')
            allbsup = namebsup.find('div', class_="process_engineerfonts")
            salary = allbsup.find('p').text
            print(salary)
            address = allbsup.find_all('span')[0].text
            print(address)
            experience = allbsup.find_all('span')[1].text
            print(experience)
            education = allbsup.find_all('span')[2].text
            print(education)
            work = allbsup.find_all('span')[3].text
            print(work)
            welfares = []
            for welfare in allbsup.find_all('li'):
                welfares.append(welfare.text)
            welfares = '#'.join(welfares)
            print(welfares)
            f.write(','.join([name, salary, address.strip(), experience, education, work, welfares]))
            f.write('\n')
if __name__ == '__main__':
html = gethtml()
parsehtml(html)
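The detail-page parsing above re-runs find_all('span') for every field and would crash on a page with fewer spans than expected. A minimal sketch that collects the spans once and pads the missing ones; parse_detail is a hypothetical helper built on the same process_engineerfonts block:

from bs4 import BeautifulSoup

def parse_detail(html):
    soup = BeautifulSoup(html, 'lxml')
    box = soup.find('div', class_="process_engineerfonts")
    salary = box.find('p').text
    spans = [s.text.strip() for s in box.find_all('span')]
    spans += [''] * (4 - len(spans))  # pad so missing fields stay aligned
    welfare = '#'.join(li.text for li in box.find_all('li'))
    return [salary, *spans[:4], welfare]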
'class ="recommend_left fl" > … < / div >''<div class="process_engineerfonts">'