爬取汽车之家新闻页面

爬取内容保存在数据库,以避免遇到各种坑

import requests,os,pymysql
from bs4 import BeautifulSoup
# Fetch the Autohome news listing page.
# timeout prevents the script from hanging forever on a stalled connection;
# raise_for_status fails fast instead of parsing an HTTP error page.
response = requests.get('https://www.autohome.com.cn/news/', timeout=10)
response.raise_for_status()
# The site serves GBK-encoded HTML; override requests' guessed encoding
# before reading .text so Chinese characters decode correctly.
response.encoding = 'gbk'

soup = BeautifulSoup(response.text, 'html.parser')

# Open the MySQL connection used to persist the scraped articles.
# NOTE(review): credentials are hard-coded — fine for a local experiment,
# but move them to config/env before any real use.
# NOTE(review): MySQL 'utf8' is the 3-byte subset; switch to 'utf8mb4' if
# titles may ever contain 4-byte characters (emoji) — confirm the schema.
sql_connet = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    password='root',
    database='autohome',
    charset='utf8',
)
# Accumulate one record per news item; the DB insert below consumes this list.
info_list = []
article_container = soup.find(name='div', attrs={"id": 'auto-channel-lazyload-article'})
news_items = article_container.find_all(name='li')  # find_all returns a list of <li> cards
for item in news_items:
    heading = item.find(name='h3')  # headline tag
    if not heading:
        # Placeholder/ad <li> entries carry no <h3>; skip them.
        continue
    summary = item.find(name='p')  # short description paragraph
    link = item.find(name='a')     # article hyperlink; .attrs is a plain dict

    article_url = 'https:' + link.attrs.get('href')

    thumbnail = item.find(name='img')
    thumbnail_url = 'https:' + thumbnail.attrs.get('src')
    thumbnail_name = thumbnail_url.rsplit('/', maxsplit=1)[1]
    # (Local image download left disabled, as in the original.)
    # filepath = 'img/' + img_name
    # with open(filepath,'wb') as f:
    #     f.write(requests.get(img_full).content)

    info_list.append({
        "title": heading.text,
        "body": summary.text,
        "a_src": article_url,
        "img_src": thumbnail_url,
    })

# Persist the scraped records.
# Fix: the original built the INSERT with %-string interpolation, so any
# title/body containing a quote broke the statement (and it was an SQL
# injection vector for page-controlled text). Pass values as parameters and
# let the driver escape them.
cursor = sql_connet.cursor()
try:
    insert_sql = "INSERT INTO info (title,body,a_src,img_src) VALUES (%s,%s,%s,%s);"
    for record in info_list:
        print(record)  # debug trace of each row being inserted
        cursor.execute(
            insert_sql,
            (record["title"], record["body"], record["a_src"], record["img_src"]),
        )
    sql_connet.commit()  # single commit for the whole batch
finally:
    # Release DB resources even if an insert fails.
    cursor.close()
    sql_connet.close()

转载于:https://www.cnblogs.com/xxy614899502/p/10116716.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值