# 案例一 (Case 1: Zhihu explore page)
import requests
import pymysql
import pymongo
from lxml import etree
from bs4 import BeautifulSoup
"""
存入mysql
"""
# class Test:
# def __init__(self):
# self.url = 'https://www.zhihu.com/explore'
# self.headers = {
# 'user-agent': ''
# }
# self.db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='python',
# charset='utf8')
# self.cursor = self.db.cursor()
#
# def get_data(self):
# response = requests.get(self.url, headers=self.headers)
# html = response.text
# return html
#
# def parse_data(self, html):
# # element = etree.HTML(html)
# # div_list = element.xpath('//*[@id="square"]/div[2]/div/div[1]/div[2]/div/div')
# # print(div_list)
# soup = BeautifulSoup(html, 'lxml')
# hot_list = soup.select('div .css-1g4zjtl a')
# # print(hot_list)
# for i in hot_list:
# question = i.get_text()
# self.save_data(question)
#
# def create_table(self):
# create_table_sql = """
# create table if not exists zhihu_hot
# (id int unsigned primary key auto_increment,
# question varchar(100) not null)
# """
# try:
# self.cursor.execute(create_table_sql)
# print('创建表成功')
# except Exception as e:
# print('创建表失败:', e)
#
# def save_data(self, question):
# insert_sql = """
# insert into zhihu_hot(id, question) values(%s, %s)
# """
# try:
# self.cursor.execute(insert_sql, (0, question))
# self.db.commit()
# print('插入数据成功')
# except Exception as e:
# print('插入数据失败:', e)
# self.db.rollback()
#
# def main(self):
# self.create_table()
# html = self.get_data()
# self.parse_data(html)
# self.cursor.close()
# self.db.close()
#
# if __name__ == '__main__':
# t = Test()
# t.main()
"""
存入mongodb
"""
# class Test2:
# def __init__(self):
# self.url = 'https://www.zhihu.com/explore'
# self.headers = {
# 'user-agent': ''
# }
# self.client = pymongo.MongoClient(host='localhost', port=27017)
# self.collection = self.client.python.zhihu_hot
#
# def get_data(self):
# response = requests.get(self.url, headers=self.headers)
# html = response.text
# return html
#
# def parse_data(self, html):
# # element = etree.HTML(html)
# # div_list = element.xpath('//*[@id="square"]/div[2]/div/div[1]/div[2]/div/div')
# # print(div_list)
# soup = BeautifulSoup(html, 'lxml')
# hot_list = soup.select('div .css-1g4zjtl a')
# # print(hot_list)
# for i in hot_list:
# item = {}
# item['question'] = i.get_text()
# self.save_data(item)
#
# def save_data(self, item):
# self.collection.insert_one(item)
# print('插入数据成功')
#
# def main(self):
# html = self.get_data()
# self.parse_data(html)
#
# if __name__ == '__main__':
# t = Test2()
# t.main()
# 案例二 (Case 2: Alibaba job postings)
import time
from pprint import pprint

import requests
import pymysql
import pymongo
"""
存入mysql
"""
# class Ali:
# def __init__(self):
# self.url = 'https://talent.alibaba.com/position/search?_csrf=8a2c0769-696d-4461-b25e-52985c7cb8c1'
# self.headers = {
# 'authority': 'talent.alibaba.com',
# 'accept': 'application/json, text/plain, */*',
# 'accept-language': 'zh-CN,zh;q=0.9',
# 'cache-control': 'no-cache',
# 'cookie': '',
# 'content-type': 'application/json',
# 'origin': 'https://talent.alibaba.com',
# 'referer': 'https://talent.alibaba.com/off-campus/position-list?lang=zh',
# 'user-agent': '',
# }
# self.params = {
# "_csrf": "8a2c0769-696d-4461-b25e-52985c7cb8c1",
# }
# self.db = pymysql.connect(host='localhost', user='root', password='12345', port=3306, db='python', charset='utf8')
# self.cursor = self.db.cursor()
#
# def get_data(self):
# for page in range(1, 10):
# data = {"channel": "group_official_site",
# "language": "zh",
# "batchId": "",
# "categories": "",
# "deptCodes": [],
# "key": "",
# "pageIndex": page,
# "pageSize": 10,
# "regions": "",
# "subCategories": ""}
#
# response = requests.post(self.url, headers=self.headers, params=self.params, json=data)
# result = response.json()
# # pprint(result)
# self.parse_data(result)
#
# def parse_data(self, result):
# for one_result in result['content']['datas']:
# categories = '' if not one_result['categories'] else one_result['categories'][0]
# department = one_result['department']
# description = one_result['description']
# experience = str(one_result['experience']['from']) if one_result['experience']['to'] == None else str(one_result['experience']['from'])+'~'+str(one_result['experience']['to'])
# name_ = one_result['name']
# positionUrl = 'https://talent.alibaba.com' + one_result['positionUrl']
# requirement = one_result['requirement']
# workLocations = one_result['workLocations'][0]
# self.save_data(categories, department, description, experience, name_, positionUrl, requirement, workLocations)
#
# def create_table(self):
# create_table_sql = """
# create table if not exists ali(
# id int unsigned primary key auto_increment,
# categories varchar(20),
# department varchar(10),
# description text,
# experience varchar(20),
# name_ varchar(60),
# positionUrl text,
# requirement text,
# workLocations varchar(5)
# )
# """
# try:
# self.cursor.execute(create_table_sql)
# print('创建表成功')
# except Exception as e:
# print('创建表失败', repr(e))
#
# def save_data(self, categories, department, description, experience, name_, positionUrl, requirement, workLocations):
# insert_sql = """
# insert into ali(id, categories, department, description, experience, name_, positionUrl, requirement, workLocations)
# values(%s, %s, %s, %s, %s, %s, %s, %s, %s)
# """
# try:
# self.cursor.execute(insert_sql, (0, categories, department, description, experience, name_, positionUrl, requirement, workLocations))
# self.db.commit()
# print('插入成功')
# except Exception as e:
# print('插入失败', repr(e))
#
# def main(self):
# self.create_table()
# self.get_data()
# self.db.close()
#
# if __name__ == '__main__':
# a = Ali()
# a.main()
"""
存入mongodb
"""
class Ali:
    """Crawl Alibaba's off-campus job-search API and store each posting in MongoDB.

    Fix: the original code called ``time.strftime``/``time.localtime`` without
    ever importing ``time`` in this script section, raising NameError at runtime.
    """

    def __init__(self):
        # The _csrf token must appear both in the URL and in self.params;
        # the API validates it as a query parameter.
        self.url = 'https://talent.alibaba.com/position/search?_csrf=8a2c0769-696d-4461-b25e-52985c7cb8c1'
        self.headers = {
            'authority': 'talent.alibaba.com',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'cookie': '',
            'content-type': 'application/json',
            'origin': 'https://talent.alibaba.com',
            'referer': 'https://talent.alibaba.com/off-campus/position-list?lang=zh',
            'user-agent': '',
        }
        self.params = {
            "_csrf": "8a2c0769-696d-4461-b25e-52985c7cb8c1",
        }
        # MongoClient connects lazily; nothing touches the server until the
        # first insert in save_data.
        self.db = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.db.python.ali

    def get_data(self):
        """POST the search endpoint for pages 1-9 and parse each JSON page."""
        for page in range(1, 10):
            data = {"channel": "group_official_site",
                    "language": "zh",
                    "batchId": "",
                    "categories": "",
                    "deptCodes": [],
                    "key": "",
                    "pageIndex": page,
                    "pageSize": 10,
                    "regions": "",
                    "subCategories": ""}
            response = requests.post(self.url, headers=self.headers, params=self.params, json=data)
            result = response.json()
            self.parse_data(result)

    def parse_data(self, result):
        """Extract the fields of interest from one result page and save each posting."""
        for one_result in result['content']['datas']:
            item = {}
            item['categories'] = one_result['categories']
            item['department'] = one_result['department']
            item['description'] = one_result['description']
            item['experience'] = one_result['experience']
            item['name'] = one_result['name']
            item['positionUrl'] = one_result['positionUrl']
            item['requirement'] = one_result['requirement']
            item['workLocations'] = one_result['workLocations']
            timestamp = one_result['modifyTime']
            # modifyTime is a 13-digit millisecond timestamp, so divide by 1000
            # before converting (a 10-digit second timestamp would not need it).
            item['modifyTime'] = time.strftime('%Y-%m-%d', time.localtime(timestamp / 1000))
            self.save_data(item)

    def save_data(self, item):
        """Insert one document into the python.ali collection."""
        self.collection.insert_one(item)
        print('插入成功')

    def main(self):
        self.get_data()
        self.db.close()

if __name__ == '__main__':
    a = Ali()
    a.main()
# 案例三 (Case 3: Bilibili video search)
import requests
import pymongo
from pprint import pprint
import time
from lxml import etree
class Bili:
    """Fetch Bilibili video-search results for the keyword 'python' and store them in MongoDB."""

    def __init__(self):
        self.url = 'https://api.bilibili.com/x/web-interface/wbi/search/type?'
        self.headers = {
            "cookie": "",
            'user-agent': ''
        }
        self.db = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.db.python.bili

    def get_data(self):
        """Request result pages 1-4 and hand each decoded JSON payload to parse_data."""
        # NOTE: w_rid looks like a request signature, so the parameter set
        # (and its order) is kept exactly as captured from the browser.
        for page in range(1, 5):
            params = {
                '__refresh__': 'true',
                '_extra': '',
                'context': '',
                'page': f'{page}',
                'page_size': '42',
                'from_source': '',
                'from_spmid': '333.337',
                'platform': 'pc',
                'highlight': '1',
                'single_column': '0',
                'keyword': 'python',
                'qv_id': '8qtKSUCRwYCYL1LJsaclQEWbcZfuytwD',
                'ad_resource': '5654',
                'source_tag': '3',
                'gaia_vtoken': '',
                'category_id': '',
                'search_type': 'video',
                'dynamic_offset': f'{24 * (page - 1)}',
                'w_rid': 'bddf1da77e5731a008948e24a2394fa9',
                'wts': '1680253326'
            }
            response = requests.get(self.url, headers=self.headers, params=params)
            self.parse_data(response.json())

    def parse_data(self, result):
        """Build a flat document per video and save it."""
        for video in result['data']['result']:
            item = {
                'arcurl': video['arcurl'],
                'author': video['author'],
            }
            # duration arrives as 'MM:SS' (minutes may exceed 59) — assumed
            # two-part; TODO confirm the API never returns 'HH:MM:SS'.
            mins, secs = video['duration'].split(':')
            hour, minute = divmod(int(mins), 60)
            item['duration'] = f'{hour}:{minute}:{int(secs)}'
            item['play'] = video['play']
            # pubdate is a 10-digit second-resolution timestamp.
            item['pubdate'] = time.strftime('%Y-%m-%d', time.localtime(video['pubdate']))
            # The title embeds highlight markup; strip it by joining the
            # plain-text nodes of the parsed fragment.
            tree = etree.HTML(video['title'])
            item['title'] = ''.join(tree.xpath('.//text()'))
            item['video_review'] = video['video_review']
            self.save_data(item)

    def save_data(self, item):
        """Insert one document into the python.bili collection."""
        self.collection.insert_one(item)
        print('插入成功')

    def main(self):
        self.get_data()
        self.db.close()

if __name__ == '__main__':
    b = Bili()
    b.main()
# 案例四 (Case 4: iQiyi recommendation list)
import requests
import json
import pymysql
import pymongo
from pprint import pprint
"""
存入mysql
"""
# class Aqili:
# def __init__(self):
# self.url = 'https://pcw-api.iqiyi.com/search/recommend/list?'
# self.headers = {
# 'cookie': ''
# }
# self.db = pymysql.connect(host='localhost', user='root', password='12345', port=3306, db='python', charset='utf8')
# self.cursor = self.db.cursor()
#
# def get_data(self):
# params = {
# 'channel_id': '2',
# 'data_type': '1',
# 'mode': '11',
# 'page_id': '2',
# 'ret_num': '48',
# 'session': 'fe7d9895c78e59c98c56203765c0b038',
# 'three_category_id': '15;must'
# }
# response = requests.get(self.url, headers=self.headers, params=params)
# result = response.json()
# # pprint(result)
# return result
#
# def parse_data(self, result):
# result_list = result['data']['list']
# for one_result_dict in result_list:
# description = one_result_dict['description'][:146]+'...' if len(one_result_dict['description']) > 150 else one_result_dict['description']
# focus = one_result_dict['focus']
# imageUrl = one_result_dict['imageUrl']
# title = one_result_dict['title']
# videoCount = one_result_dict['videoCount']
# score = one_result_dict['score']
# self.save_data(description, focus, imageUrl, title, videoCount, score)
#
# def create_table(self):
# create_table_sql = """
# create table if not exists aqili(
# id int unsigned primary key auto_increment,
# description text,
# focus varchar(50),
# imageUrl varchar(100),
# title varchar(30),
# videoCount int,
# score int
# )
# """
# try:
# self.cursor.execute(create_table_sql)
# print('创建表成功')
# except Exception as e:
# print('创建表失败', repr(e))
#
# def save_data(self, description, focus, imageUrl, title, videoCount, score):
# insert_sql = """
# insert into aqili(id, description, focus, imageUrl, title, videoCount, score)
# values(%s, %s, %s, %s, %s, %s, %s)
# """
# try:
# self.cursor.execute(insert_sql, (0, description, focus, imageUrl, title, videoCount, score))
# self.db.commit()
# print('插入成功')
# except Exception as e:
# print('插入失败', repr(e))
# self.db.rollback()
#
# def main(self):
# self.create_table()
# result = self.get_data()
# self.parse_data(result)
# self.db.close()
#
# if __name__ == '__main__':
# a = Aqili()
# a.main()
"""
存入mongodb
"""
class Aqili:
    """Crawl iQiyi's recommendation-list API and store the results in MongoDB.

    Fix: the MongoClient was never closed (the MySQL variant and the other
    examples all close their connections in ``main``).
    """

    def __init__(self):
        self.url = 'https://pcw-api.iqiyi.com/search/recommend/list?'
        self.headers = {
            'user-agent': ''
        }
        # MongoClient connects lazily; the connection is made on first insert.
        self.db = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.db.python.aqili

    def get_data(self):
        """GET one page of recommendations and return the decoded JSON."""
        params = {
            'channel_id': '2',
            'data_type': '1',
            'mode': '11',
            'page_id': '3',
            'ret_num': '48',
            'session': 'fe7d9895c78e59c98c56203765c0b038',
            'three_category_id': '15;must'
        }
        response = requests.get(self.url, headers=self.headers, params=params)
        result = response.json()
        return result

    def parse_data(self, result):
        """Extract the fields of interest from each entry and save them."""
        for one_result in result['data']['list']:
            item = {}
            item['categories'] = one_result['categories']
            item['description'] = one_result['description']
            item['focus'] = one_result['focus']
            # 'main_charactor' is the API's own (misspelled) key — do not "fix" it.
            item['people'] = [i['name'] for i in one_result['people']['main_charactor']]
            item['score'] = one_result['score']
            item['title'] = one_result['title']
            item['videoCount'] = one_result['videoCount']
            self.save_data(item)

    def save_data(self, item):
        """Insert one document into the python.aqili collection."""
        self.collection.insert_one(item)
        print('插入成功')

    def main(self):
        result = self.get_data()
        self.parse_data(result)
        # Release the Mongo connection when done (was previously leaked).
        self.db.close()

if __name__ == '__main__':
    a = Aqili()
    a.main()
# 案例五 (Case 5: Mango TV movie library)
import requests
import pymongo
import pymysql
from lxml import etree
class MangGuoTV:
    """Crawl Mango TV's movie library and store every entry in BOTH MongoDB and MySQL.

    Page 1 is server-rendered HTML (parse_data1); pages 2-11 come from the
    JSON list API (get_data2/parse_data2).

    Fixes: parse_data1 no longer re-runs the same XPath queries a second time
    for the MySQL copy, and the MySQL insert-failure message now includes the
    exception (it was silently dropped before).
    """

    def __init__(self):
        self.base_url = 'https://www.mgtv.com/lib/2?lastp=list_index&kind=a1&area=10&year=all&chargeInfo=a1&sort=c2'
        self.headers = {
            'User-Agent': ''
        }
        self.mongo_db = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.mongo_db.python.mgtv
        self.mysql_db = pymysql.connect(host='localhost', user='root', password='12345', db='python', port=3306, charset='utf8')
        self.cursor = self.mysql_db.cursor()

    def parse_data1(self):
        """Scrape page 1 (server-rendered HTML) and save each movie card."""
        response = requests.get(self.base_url, headers=self.headers)
        element = etree.HTML(response.text)
        div_list = element.xpath('//div[@class="m-list clearfix"]/div')
        for div in div_list:
            item = {}
            item['story'] = '空'  # page-1 markup carries no synopsis
            # Tags are separated by literal '/' text nodes — drop those.
            item['subtitle'] = ','.join([i for i in div.xpath('.//p[@class="u-desc"]/span//text()') if i != '/'])
            item['title'] = div.xpath('.//p[@class="hitv_vertical-title"]/a/text()')[0]
            update_texts = div.xpath('.//div[@class="hitv_vertical-tag"]/p/text()')
            item['updateInfo'] = update_texts[0].strip('\n ') if update_texts else '空'
            self.mongo_save_data(item)
            # Reuse the values already extracted instead of repeating the
            # identical XPath queries (the original duplicated all four).
            self.mysql_save_data(item['story'], item['subtitle'], item['title'], item['updateInfo'])

    def get_data2(self):
        """Fetch pages 2-11 from the JSON list API and parse each response."""
        for i in range(2, 12):
            url = f'https://pianku.api.mgtv.com/rider/list/pcweb/v3?allowedRC=1&platform=pcweb&channelId=2&pn={i}&pc=80&hudong=1&_support=10000000&kind=a1&area=10&year=all&chargeInfo=a1&sort=c2'
            response = requests.get(url, headers=self.headers)
            result = response.json()
            self.parse_data2(result)

    def parse_data2(self, result):
        """Save each JSON hit to both stores."""
        for i in result['data']['hitDocs']:
            item = {}
            item['story'] = i['story']
            item['subtitle'] = i['subtitle']
            item['title'] = i['title']
            item['updateInfo'] = i['updateInfo']
            self.mongo_save_data(item)
            self.mysql_save_data(item['story'], item['subtitle'], item['title'], item['updateInfo'])

    def mongo_save_data(self, item):
        """Insert one document into the python.mgtv collection."""
        self.collection.insert_one(item)
        print('数据插入成功')

    def create_table(self):
        """Create the mgtv table if it does not already exist."""
        create_table_sql = '''
        create table if not exists mgtv(
        id int unsigned primary key auto_increment,
        story text,
        subtitle varchar(50),
        title varchar(40),
        updateInfo varchar(15)
        )
        '''
        try:
            self.cursor.execute(create_table_sql)
            print('创建表成功')
        except Exception as e:
            print('创建表失败', e)

    def mysql_save_data(self, story, subtitle, title, updateInfo):
        """Insert one parameterized row into the mgtv table, rolling back on failure."""
        insert_sql = '''
        insert into mgtv(id, story, subtitle, title, updateInfo)
        values(%s, %s, %s, %s, %s)
        '''
        try:
            self.cursor.execute(insert_sql, (0, story, subtitle, title, updateInfo))
            self.mysql_db.commit()
            print('数据插入成功')
        except Exception as e:
            # Include the exception so failures are diagnosable (was dropped).
            print('数据插入失败', e)
            self.mysql_db.rollback()

    def main(self):
        self.create_table()
        self.parse_data1()
        self.get_data2()
        self.mongo_db.close()
        self.mysql_db.close()

if __name__ == '__main__':
    mgtv = MangGuoTV()
    mgtv.main()