Futures
import requests
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(filename)s[line:%(lineno)d]-'
'%(levelname)s: %(message)s')
def gethtml(url, pageNumber=1):
    post_data = {
        'flag': 'qh',
        'prod_type': 'lntx',
        'pageNumber': pageNumber,
        'pageSize': '15'
    }
reup = requests.post(url, data=post_data)
return reup
def parsehtml():
size = 1
count = 0
url = 'http://price.mofcom.gov.cn/datamofcom/front/price/pricequotation/codeDetailQuery'
while True:
reup = gethtml(url, size)
logging.info(reup.text)
if count == 0:
count = reup.json().get("maxPageNum")
rox_list = reup.json().get("rows")
for rox in rox_list:
            with open('D://期货.csv', 'a', encoding='utf-8') as f:
                f.write(rox.get("prod_name") + ',')  # comma-separate the CSV fields
                f.write(rox.get("prod_spec") + ',')
num = rox.get("seqno")
seqno_data = {
'seqno': num,
'startTime': '',
'endTime': '',
'pageNumber': '1',
'pageSize': '10000'
}
                reup_seqno = requests.post(
                    'http://price.mofcom.gov.cn/datamofcom/front/price/pricequotation/priceQueryList', data=seqno_data)
                try:
                    row0 = reup_seqno.json().get('rows')[0]  # parse the JSON once, reuse below
                    money = row0.get('price') + row0.get('unit')
                except (IndexError, TypeError):
                    logging.error('no data')
                    f.write('\n')
                    continue
f.write(money)
f.write('\n')
        logging.info('scrape succeeded')
if count > size:
size = size + 1
else:
break
if __name__ == '__main__':
    parsehtml()
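The futures script above assembles each CSV line by hand, so a field that itself contains a comma would break the columns. A minimal sketch of the same row-writing step with the standard csv module, which quotes such fields automatically (the write_rows name and its default path are illustrative, not part of the original script):

import csv

def write_rows(rows, path='D://期货.csv'):
    # newline='' stops the csv module from inserting blank lines on Windows
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for row in rows:
            # the same two fields the script pulls from each JSON row
            writer.writerow([row.get('prod_name'), row.get('prod_spec')])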
Weather
# http://www.envicloud.cn/dataMap?title=3
import requests
url = 'http://www.envicloud.cn/getAreaInfoByZoom.action?zoom=12'
def gethtml(url):  # fetch the page
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
"Referer": "http://price.mofcom.gov.cn/pricequotation/morepricequotation.shtml?flag=qh&prod_type=lntx"}
resp = requests.post(url, headers=header)
return resp
def getdetail(html):
    data = html.json()  # parse the response body as JSON
    cities = data.get('CITYINFO')
    for i in cities:
        print(i)
        lng = i.get('longitude')  # longitude
        lat = i.get('latitude')   # latitude
        resp = gethtml(f"http://www.envicloud.cn/getPointDetail.action?lng={lng}&lat={lat}")  # fetch the point detail for these coordinates
        cityjson = resp.json()
        citycode = cityjson.get('citycode')
        with open('D://天气.csv', 'a', encoding='utf-8') as f:  # append to D://天气.csv
            for city_0 in cityjson.values():
                f.write(str(city_0) + ',')  # str() guards against non-string JSON values
            cityhtml = gethtml(
                f'http://www.envicloud.cn/getWeatherForecast.action?citycode={citycode}').json()  # fetch the forecast by city code
            for cityshuj in cityhtml.values():
                f.write(str(cityshuj) + ',')
            f.write('\n')
if __name__ == '__main__':
getdetail(gethtml(url))
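The weather script fires three requests per city back to back. A minimal sketch of a politer fetch, assuming the same getPointDetail endpoint; fetch_point is a hypothetical helper, and requests.Session reuses one connection across calls:

import time
import requests

session = requests.Session()  # one TCP connection reused across requests
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def fetch_point(lng, lat, delay=1.0):
    resp = session.post(
        f'http://www.envicloud.cn/getPointDetail.action?lng={lng}&lat={lat}')
    time.sleep(delay)  # pause between cities so the server is not hammered
    return resp.json()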
Book List
import requests
from bs4 import BeautifulSoup
import os
def gethtml():
for num in range(1,35):
url = f'http://www.bookschina.com/24hour/1_0_{num}/'
reup = requests.get(url)
yield reup.text
def parsehtml(html):
    bsop = BeautifulSoup(html, 'lxml')
    book_list = bsop.find('div', class_="bookList")
    li_list = book_list.find_all('li')
    for li in li_list:
        row = []
        book = li.find('div', class_="infor")
        row.append(book.find('h2').find('a').get('title'))
        row.append(book.find('div', class_="author").find('a').text)
        row.append(book.find('div', class_="priceWrap").find('span', class_="sellPrice").text)
        row.append(book.find('div', class_="priceWrap").find('span', class_="discount").text)
        tuphtml = li.find('div', class_="cover").find('img').get('data-original')
        try:
            tupa = requests.get(f'http:{tuphtml}')
            tuname = tuphtml.split('/')[-1]
            with open('D://书籍列表.csv', 'a', encoding='utf-8') as f:
                f.write(','.join(row))  # comma-separated so the CSV has real columns
                f.write('\n')
            os.makedirs('D://bookimgs', exist_ok=True)  # create the folder once, no if/else needed
            with open(f'D://bookimgs/{tuname}', 'wb') as f:
                f.write(tupa.content)
        except (requests.RequestException, AttributeError):
            print('no image')
if __name__ == '__main__':
html_list = gethtml()
for html in html_list:
parsehtml(html)
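The cover downloads above assume every request succeeds and would happily save an error page as an image. A minimal sketch with a timeout and an explicit status check; save_cover is a hypothetical helper name:

import os
import requests

def save_cover(img_url, folder='D://bookimgs'):
    resp = requests.get(img_url, timeout=10)  # don't hang forever on a dead host
    resp.raise_for_status()                   # surface 404s instead of saving the error body
    os.makedirs(folder, exist_ok=True)
    name = img_url.split('/')[-1]
    with open(os.path.join(folder, name), 'wb') as f:
        f.write(resp.content)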
Book Reviews
import requests
from bs4 import BeautifulSoup
import logging
import re
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(filename)s[line:%(lineno)d]-'
'%(levelname)s: %(message)s')
def gethtml(bookdata):
for len_data in bookdata:
url = 'http://www.bookschina.com/ashx/GetMsg.ashx'
html = requests.post(url, data=len_data)
parsehtml(html.json().get('Html'))
def nextpage():
    numhtml = requests.get('http://www.bookschina.com/8229303.htm#tabookReco')
    skip_text = BeautifulSoup(numhtml.text, 'lxml').find('div', class_="p-skip").text
    num = re.search(r'\d+', skip_text).group()  # pull the full page count out of the "/207"-style text
    for page in range(int(num)):
        bookdata = {
            '_page': page + 1,
            '_bookid': '8229303',
            '_totalPage': '207',
        }
        yield bookdata
def parsehtml(html):
    bsup = BeautifulSoup(html, 'lxml')
    item = {}
    all_li = bsup.find_all('li')
    for li in all_li:
        item['name'] = li.find('a', target="_blank").text
        item['comment'] = li.find('p').text
        item['grade'] = len(li.find_all('i', class_="one"))  # one star icon per grade point
        CommentOnTheTitle = li.find('div', class_="theme").text.split(':')
        if CommentOnTheTitle[1] == '':
            CommentOnTheTitle[1] = '无'
        item['CommentOnTheTitle'] = CommentOnTheTitle[1]
        item['time'] = li.find('span', class_="time").text
        with open('D://书籍评论.csv', 'a', encoding='utf-8') as f:
            f.write(','.join(str(v) for v in item.values()))  # one comma-separated row per review
            f.write('\n')
logging.info('over')
if __name__ == '__main__':
gethtml(nextpage())
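parsehtml above joins the dict values by hand, so a comment containing a comma would spill across columns. A minimal sketch of the same write with csv.DictWriter, which quotes such fields; the FIELDS list mirrors the keys the function sets, and write_item is a hypothetical helper:

import csv

FIELDS = ['name', 'comment', 'grade', 'CommentOnTheTitle', 'time']

def write_item(item, path='D://书籍评论.csv'):
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS)
        writer.writerow(item)  # one properly quoted row per review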
News
import requests
import re
import os
def gethtml():
    for i in range(3):
        if i == 0:
            url = 'https://news.163.com/special/cm_guonei/?callback=data_callback'
        else:
            url = f'https://news.163.com/special/cm_guonei_0{i+1}/?callback=data_callback'
        reqs = requests.get(url)
        parsehtml(reqs.text)
def parsehtml(html):
    items = re.findall('{.*?"t.*?(.*?)add3.*?}', html, re.S)  # one JS object per news entry
    for data in items:
        title = re.findall('itle.*?:"(.*?)",', data)
        time = re.findall('time.*?:"(.*?)",', data)
        keywords = re.findall('keyname":"(.*?)"}', data)
        replies = re.findall('tienum":(.*?),', data)
        img = re.findall('imgurl.*?:"(.*?)"', data)
        print(title)
        print(keywords)
        with open('D://新闻.csv', 'a', encoding='utf-8') as f:
            f.writelines(str([title, time, keywords, replies]))
            f.write('\n')
        try:
            os.makedirs('D://news', exist_ok=True)  # create the image folder once
            with open('D://news/' + img[0].split('/')[-1], 'wb') as f:
                f.write(requests.get(url=img[0]).content)
        except (IndexError, requests.RequestException):
            pass  # no image for this entry
if __name__ == '__main__':
gethtml()
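The news pages return JSONP: the payload is wrapped in a data_callback(...) call, which is why the script falls back to regexes. If the wrapped body happens to be valid JSON (an assumption worth checking against the live response), a sketch like this parses it directly; fetch_items is a hypothetical helper:

import json
import requests

def fetch_items(url):
    text = requests.get(url).text.strip()
    # strip the 'data_callback(' prefix and the trailing ')' of the JSONP wrapper
    body = text[text.index('(') + 1:text.rindex(')')]
    return json.loads(body)  # a list of dicts with the title/time/keyname fields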
News List
'''2. Import urllib and the other required libraries correctly
3. Work out the correct request headers and page structure by inspection
4. Define a gethtml() function that fetches the page with the corresponding urllib methods
5. Create a parsing function parsehtml(html) that uses Beautiful Soup to extract the job title, location, job type, experience, education, and benefits
6. Save the extracted job title, location, job type, experience, education, and benefits (benefit entries separated by #) to D://新闻列表.csv'''
import requests
from bs4 import BeautifulSoup
def gethtml():
url = 'http://www.pjob.net/china.htm'
reup = requests.get(url)
return reup.text
def parsehtml(html):
    beup = BeautifulSoup(html, 'lxml')
    beuphtml = beup.find('ul', class_="hot_job_list")
    posts = beuphtml.find_all('a', class_="hot_post")
    for item in posts:
        with open('D://新闻列表.csv', 'a', encoding='utf-8') as f:
            url = item.get('href')
            print(url)
            name = item.get('title')
            print(name)
            namehtml = requests.get(url)
            namebsup = BeautifulSoup(namehtml.text, 'lxml')
            allbsup = namebsup.find('div', class_="process_engineerfonts")
            salary = allbsup.find('p').text
            print(salary)
            address = allbsup.find_all('span')[0].text
            print(address)
            experience = allbsup.find_all('span')[1].text
            print(experience)
            education = allbsup.find_all('span')[2].text
            print(education)
            work = allbsup.find_all('span')[3].text
            print(work)
            welfares = []
            for welfare in allbsup.find_all('li'):
                welfares.append(welfare.text)
            welfares = '#'.join(welfares)
            print(welfares)
            f.write(','.join([name, salary, address.strip(), experience, education, work, welfares]))
            f.write('\n')
if __name__ == '__main__':
html = gethtml()
parsehtml(html)
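The detail-page parsing above re-runs find_all('span') for every field and would crash on a page with fewer spans than expected. A minimal sketch that collects the spans once and pads the missing ones; parse_detail is a hypothetical helper built on the same process_engineerfonts block:

from bs4 import BeautifulSoup

def parse_detail(html):
    soup = BeautifulSoup(html, 'lxml')
    box = soup.find('div', class_="process_engineerfonts")
    salary = box.find('p').text
    spans = [s.text.strip() for s in box.find_all('span')]
    spans += [''] * (4 - len(spans))  # pad so missing fields stay aligned
    welfare = '#'.join(li.text for li in box.find_all('li'))
    return [salary, *spans[:4], welfare]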
'class ="recommend_left fl" > … < / div >''<div class="process_engineerfonts">'