#-*- coding:utf-8 -*-
#Filename:中策大数据test
#Author:Guan
#Datetime:2018/11/24
#导报
import requests
from bs4 import BeautifulSoup
import json
import os
# Fetch a result page from the site.
def get_html(url):
    """Download *url* and return the decoded HTML source.

    Args:
        url: Full URL of the listing page to fetch.

    Returns:
        str: The response body decoded as text (UTF-8 by default).

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        # NOTE(review): session cookie is hard-coded and will expire; refresh
        # it from a logged-in browser session when requests start failing.
        "Cookie": "PHPSESSID=t7p6n6e5fppv6f3mrebfccsevd; Hm_lvt_d2634fb106dc720564524e04dfd88bec=1542972127,1543019252,1543191784,1543277236; Hm_lpvt_d2634fb106dc720564524e04dfd88bec=1543277998"
    }
    # Timeout added so a dead connection cannot hang the whole crawl.
    response = requests.get(url=url, headers=headers, timeout=30)
    return response.content.decode()
# Parse the listing rows out of a page and persist them as JSON lines.
def get_cont(html, out_path='C:\\Users\\admin123\\Desktop\\new.txt'):
    """Extract project rows from *html* and append them to *out_path*.

    Each ``.content_nr`` row yields one dict of up to six fields taken from
    its <td> cells; each dict is printed and appended as one JSON line.

    Args:
        html: Page source as returned by ``get_html``.
        out_path: Destination file for the JSON lines (appended, UTF-8).
            Defaults to the original hard-coded desktop path.

    Returns:
        None. Side effects: prints each record and appends it to *out_path*.
    """
    soup = BeautifulSoup(html, 'lxml')
    rows = soup.select('.content_nr')
    # <td> index -> output key.  NOTE(review): 'monry' looks like a typo for
    # 'money', but it is kept verbatim to preserve the existing output schema.
    field_map = (
        (1, 'area'),
        (2, 'title'),
        (3, 'jinzhan'),
        (4, 'leixing'),
        (5, 'monry'),
        (6, 'pub_date'),
    )
    records = []
    for row in rows:
        cells = row.select('td')
        record = {}
        for idx, key in field_map:
            try:
                record[key] = cells[idx].get_text().strip()
            except IndexError:
                # Cell missing from this row: skip the field, keep the record.
                pass
        records.append(record)
    # Open the output file once rather than once per record.
    with open(out_path, 'a', encoding='utf-8') as f:
        for record in records:
            line = json.dumps(record, ensure_ascii=False)
            print(line)
            f.write(line + '\n')
if __name__ == '__main__':
    # Crawl listing pages 1..28 and append every page's rows to the file.
    for page in range(1, 29):
        page_url = "https://www.china0001.com.cn/project/p%d/" % page
        print('正在获取%d页数据' % page)
        get_cont(get_html(page_url))
    print('写入完成')
# 中CE——Database
# 最新推荐文章于 2024-10-02 10:53:34 发布
# (Blog-platform footer residue from the original paste; commented out so the file parses.)