python学习4

最新推荐文章于 2024-10-30 19:27:31 发布

瀑听石

最新推荐文章于 2024-10-30 19:27:31 发布

阅读量420

点赞数 3

文章标签： python

本文链接：https://blog.csdn.net/qq_42197919/article/details/138220453

版权

# -*- coding: utf-8 -*-
# @Time : 2024/4/2 19:22
# @Author : wangz
# @File : my_crawler.py
import csv
import datetime
import json
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup

# Fetch the site's home page with a desktop-browser User-Agent and parse it.
url = 'https://www.ycjsxy.com'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
}
# requests applies no timeout by default, so without one an unresponsive
# server would block the script indefinitely.
response = requests.get(url, headers=headers, timeout=10)
print('response: ', response)
# Raw response bytes are handed to the parser as-is.
html_text = response.content
soup = BeautifulSoup(html_text, 'html.parser')
# soup.title is None when the page has no <title>; print an empty line
# instead of raising AttributeError in that case.
print(soup.title.text if soup.title is not None else '')


# 用来保存提取出来的video

class Video:
    """Plain record for one scraped entry.

    Attributes are stored exactly as passed in:
    ``title``, ``href`` and ``img_url``.
    """

    def __init__(self, title, href, img_url):
        # Keep the three constructor arguments unchanged as attributes.
        self.title, self.href, self.img_url = title, href, img_url


# Collect the <li> cells carrying the grid classes below.
# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
items = soup.find_all('li', {"class": "col-md-3 col-sm-6 col-xs-12"})
videos = []
numbers_pic = 0
for item in items:
    # Only the inner "label_item" div is looked up and printed for inspection.
    label_items = item.find('div', {'class': 'label_item'})
    print(label_items)
    # NOTE: the field extraction below is disabled, so `videos` stays empty
    # and the later CSV export / image download steps have nothing to process.
    # titles = label_items.find('div', {'class': 'tit'})
    # print("titles1:", titles)
    # titles = label_items.find('div', {'class': 'tit'}).find('h2').find('a').get('title')
    # print("titles:", titles)
    # href = label_items.find('div', {'class': 'tit'}).find('h2').find('a').get('href')
    # print("href:", href)
    # img_url = url + label_items.find('a').find('img').get('src')
    # print("img_url:", img_url)
    # numbers_pic += 1
    # v = Video(titles, href, img_url)
    # videos.append(v)
    # with open('get.json', 'w', encoding='utf-8') as f:
    #     json.dump(titles, f)
# Export the collected entries to a CSV file with a header row.
file_name = "new.csv"
# newline='' leaves line endings to the csv module. The explicit UTF-8
# encoding makes the output independent of the platform's default locale,
# so non-ASCII titles are written the same way everywhere.
with open(file_name, 'w', newline='', encoding='utf-8') as f:
    pen = csv.writer(f)
    pen.writerow(['titles', 'href', 'img_url'])
    pen.writerows([v.title, v.href, v.img_url] for v in videos)
# Download each entry's image into the working directory, named by its
# position in `videos` (0.jpg, 1.jpg, ...).
for pic_name, v in enumerate(videos):
    urllib.request.urlretrieve(v.img_url, filename=f'./{pic_name}.jpg')
# Sample URLs from the site (presumably an entry page and its thumbnail):
# https://www.ycjsxy.com/html/945/2024-03-23/content-8121.html
# https://www.ycjsxy.com/attachment/cms/item/2024_03/23_14/cd439a6e960e6076.jpg.cthumb.jpg