# 中策大数据 —— Database

#-*- coding:utf-8 -*-
#Filename:中策大数据test
#Author:Guan
#Datetime:2018/11/24

# 导包 (imports)
import requests
from bs4 import BeautifulSoup
import json
import os
#获取网页数据
def get_html(url, timeout=10):
    """Fetch *url* and return its decoded HTML source as a string.

    Args:
        url: Page URL to request.
        timeout: Seconds before the request is aborted (default 10).
            The original code had no timeout, so a stalled server
            would hang the crawler forever.

    Returns:
        The response body decoded with the default (UTF-8) codec.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    # NOTE(review): the hardcoded session cookie will expire; requests
    # made after that may return a login page instead of data.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        "Cookie": "PHPSESSID=t7p6n6e5fppv6f3mrebfccsevd; Hm_lvt_d2634fb106dc720564524e04dfd88bec=1542972127,1543019252,1543191784,1543277236; Hm_lpvt_d2634fb106dc720564524e04dfd88bec=1543277998"
    }
    # Fetch the raw bytes and decode them to text.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.content.decode()
#获取网页内容
def get_cont(html, out_path='C:\\Users\\admin123\\Desktop\\new.txt'):
    """Parse project rows out of *html* and append them to *out_path*
    as one JSON object per line (also echoed to stdout).

    Args:
        html: Page source produced by ``get_html``.
        out_path: Destination text file; kept as the original hardcoded
            desktop path by default for backward compatibility.

    Each row with CSS class ``content_nr`` is mapped to a dict keyed by
    column meaning; a missing <td> simply omits that key, matching the
    original swallow-and-continue behavior (minus its stray blank
    ``print()`` calls).
    """
    soup = BeautifulSoup(html, 'lxml')
    # Don't shadow the builtin `list` as the original did.
    rows = soup.select('.content_nr')
    # (output key, <td> index). 'monry' is a typo in the original output
    # schema, kept verbatim so existing consumers of the file still work.
    fields = (
        ('area', 1),
        ('title', 2),
        ('jinzhan', 3),
        ('leixing', 4),
        ('monry', 5),
        ('pub_date', 6),
    )
    records = []
    for row in rows:
        cells = row.select('td')
        record = {}
        for key, idx in fields:
            try:
                record[key] = cells[idx].get_text().strip()
            except IndexError:
                # Row has fewer columns than expected; skip this field.
                pass
        records.append(record)
    # Open the file once instead of re-opening it per record.
    with open(out_path, 'a', encoding='utf-8') as f:
        for record in records:
            line = json.dumps(record, ensure_ascii=False)
            print(line)
            f.write(line + '\n')

if __name__ == '__main__':
    # Crawl listing pages 1..28 of the project index and append every
    # parsed row to the output file via get_cont().
    for k in range(1,29):
        url = "https://www.china0001.com.cn/project/p%d/"%k
        # "Fetching data for page %d"
        print('正在获取%d页数据'%k)
        html = get_html(url)
        get_cont(html)
    # "Writing finished"
    print('写入完成')
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值