# -*- coding: utf-8 -*-
# @Time : 2024/4/2 19:22
# @Author : wangz
# @File : my_crawler.py
import csv
import datetime
import json
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
# Page that is fetched and parsed by the rest of this script.
url = 'https://www.ycjsxy.com'
# Send a desktop-browser User-Agent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
}
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
print('response: ', response)
# Stop on 4xx/5xx responses instead of parsing an error page as if it were the listing.
response.raise_for_status()
html_text = response.content
soup = BeautifulSoup(html_text, 'html.parser')
# soup.title is None when the document has no <title>; only print it when present.
if soup.title is not None:
    print(soup.title.text)
class Video:
    """Hold the fields extracted for one video entry.

    The three constructor arguments are stored unchanged as attributes of the
    same name; no validation or normalisation is performed.

    Args:
        title: Title of the entry.
        href: Link associated with the entry.
        img_url: URL of the entry's image.
    """

    def __init__(self, title, href, img_url):
        self.title = title
        self.href = href
        self.img_url = img_url

    def __repr__(self):
        """Return a constructor-style representation for debugging."""
        return (
            f"{type(self).__name__}(title={self.title!r}, "
            f"href={self.href!r}, img_url={self.img_url!r})"
        )
# Extract the list entries: every <li> selected by the class string below.
# find_all() is the supported method name; findAll() is a deprecated alias.
items = soup.find_all('li', {"class": "col-md-3 col-sm-6 col-xs-12"})
videos = []
numbers_pic = 0
for item in items:
    # find() returns None when the entry has no matching <div>; it is only
    # printed here, so a missing element does not raise.
    label_items = item.find('div', {'class': 'label_item'})
    print(label_items)
    # NOTE(review): the field extraction below is disabled, so `videos` stays
    # empty and `numbers_pic` stays 0. Confirm whether this is intentional
    # before re-enabling; each chained find() can return None and would then
    # need a guard.
    # titles = label_items.find('div', {'class': 'tit'})
    # print("titles1:", titles)
    # titles = label_items.find('div', {'class': 'tit'}).find('h2').find('a').get('title')
    # print("titles:", titles)
    # href = label_items.find('div', {'class': 'tit'}).find('h2').find('a').get('href')
    # print("href:", href)
    # img_url = url + label_items.find('a').find('img').get('src')
    # print("img_url:", img_url)
    # numbers_pic += 1
    # v = Video(titles, href, img_url)
    # videos.append(v)
    # with open('get.json', 'w', encoding='utf-8') as f:
    # json.dump(titles, f)
# Write the collected entries to a CSV file: one header row, then one row per video.
file_name = "new.csv"
# newline='' leaves line-ending handling to the csv module. The explicit
# UTF-8 encoding keeps non-ASCII text from depending on the platform's
# default encoding.
with open(file_name, 'w', newline='', encoding='utf-8') as f:
    pen = csv.writer(f)
    pen.writerow(['titles', 'href', 'img_url'])
    pen.writerows([v.title, v.href, v.img_url] for v in videos)
# Download every video's image into the current directory as ./0.jpg,
# ./1.jpg, ... following the order of `videos`.
# urllib.request has to be imported explicitly at the top of the file;
# `import urllib` alone does not guarantee the submodule is loaded.
for pic_name, v in enumerate(videos):
    # The image bytes must be fetched from the URL; writing v.img_url itself
    # to the file would only store the address text.
    urllib.request.urlretrieve(v.img_url, filename=f'./{pic_name}.jpg')
# https://www.ycjsxy.com/html/945/2024-03-23/content-8121.html
# https://www.ycjsxy.com/attachment/cms/item/2024_03/23_14/cd439a6e960e6076.jpg.cthumb.jpg
# Python study notes 4
# (Web page residue: latest recommended article published 2024-10-30 19:27:31)