#-*- coding:utf-8 -*-
#Filename:中策大数据test
#Author:Guan
#Datetime:2018/11/24
#导报
import requests
from bs4 import BeautifulSoup
import json
import os
# Fetch a result page from the site.
def get_html(url):
    """Download *url* and return the decoded HTML source.

    Args:
        url: Full URL of the listing page to fetch.

    Returns:
        str: The response body decoded as text (UTF-8 by default).

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        # NOTE(review): session cookie is hard-coded and will expire; refresh
        # it from a logged-in browser session when requests start failing.
        "Cookie": "PHPSESSID=t7p6n6e5fppv6f3mrebfccsevd; Hm_lvt_d2634fb106dc720564524e04dfd88bec=1542972127,1543019252,1543191784,1543277236; Hm_lpvt_d2634fb106dc720564524e04dfd88bec=1543277998"
    }
    # Timeout added so a dead connection cannot hang the whole crawl.
    response = requests.get(url=url, headers=headers, timeout=30)
    return response.content.decode()
# Parse the listing rows out of a page and persist them as JSON lines.
def get_cont(html, out_path='C:\\Users\\admin123\\Desktop\\new.txt'):
    """Extract project rows from *html* and append them to *out_path*.

    Each ``.content_nr`` row yields one dict of up to six fields taken from
    its <td> cells; each dict is printed and appended as one JSON line.

    Args:
        html: Page source as returned by ``get_html``.
        out_path: Destination file for the JSON lines (appended, UTF-8).
            Defaults to the original hard-coded desktop path.

    Returns:
        None. Side effects: prints each record and appends it to *out_path*.
    """
    soup = BeautifulSoup(html, 'lxml')
    rows = soup.select('.content_nr')
    # <td> index -> output key.  NOTE(review): 'monry' looks like a typo for
    # 'money', but it is kept verbatim to preserve the existing output schema.
    field_map = (
        (1, 'area'),
        (2, 'title'),
        (3, 'jinzhan'),
        (4, 'leixing'),
        (5, 'monry'),
        (6, 'pub_date'),
    )
    records = []
    for row in rows:
        cells = row.select('td')
        record = {}
        for idx, key in field_map:
            try:
                record[key] = cells[idx].get_text().strip()
            except IndexError:
                # Cell missing from this row: skip the field, keep the record.
                pass
        records.append(record)
    # Open the output file once rather than once per record.
    with open(out_path, 'a', encoding='utf-8') as f:
        for record in records:
            line = json.dumps(record, ensure_ascii=False)
            print(line)
            f.write(line + '\n')
if __name__ == '__main__':
    # Crawl listing pages 1..28 and append every page's rows to the file.
    for page in range(1, 29):
        page_url = "https://www.china0001.com.cn/project/p%d/" % page
        print('正在获取%d页数据' % page)
        get_cont(get_html(page_url))
    print('写入完成')
# 中CE——Database
# 最新推荐文章于 2024-10-02 10:53:34 发布
# (Blog-platform footer residue from the original paste; commented out so the file parses.)