import requests
from lxml import etree
import re
import pymysql
# Request headers: present a desktop Chrome user agent so the site
# serves the normal HTML pages instead of blocking the scraper.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
# Open the MySQL connection the scraped rows are written into.
# NOTE(review): credentials are hard-coded; consider moving them to config.
db = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    password='123456',
    database='python',
    charset='utf8',
)
# Cursor used for the INSERT statements issued by the crawl loop below.
cursor = db.cursor()
# Crawl the first three listing pages of the "latest movies" section.
# range(1, 4) yields the page numbers directly instead of mutating the
# loop variable with `p += 1` as the original did.
for page in range(1, 4):
    url = 'http://www.ygdy8.com/html/gndy/dyzz/list_23_{}.html'.format(page)
    print(url)
    # Fetch the listing page.
    req = requests.get(url, headers=headers)
    # The site serves GBK-family pages; force the encoding so .text decodes.
    req.encoding = 'gb2312'
    response = req.text
    # Parse the listing and pull out the per-movie detail-page hrefs.
    html_obj = etree.HTML(response)
    html_list = html_obj.xpath('//div[@class="co_content8"]/ul/td/table/tr[2]/td[2]/b/a/@href')
    for link in html_list:
        # Detail hrefs are site-relative; build the absolute URL.
        url_b = 'http://www.ygdy8.com{}'.format(link)
        req = requests.get(url_b, headers=headers)
        req.encoding = 'gb2312'
        response = req.text
        # Extract the download link and the title from the detail page.
        # Guard against pages whose markup differs: the original called
        # .group(1) directly and crashed with AttributeError on a miss.
        href_match = re.search(r'href="(.*)"><strong><font', response)
        title_match = re.search(r'<font color=#07519a>(.*)</font></h1></div>', response)
        if href_match is None or title_match is None:
            continue
        href = href_match.group(1)
        title = title_match.group(1)
        print(title)
        # Parameterized query: never interpolate scraped text into the SQL
        # string — quotes in a title would break the statement and the
        # original str.format version was an SQL-injection vector.
        cursor.execute(
            'insert into dianying values (null, %s, %s)',
            (title, href),
        )
        db.commit()
cursor.close()
db.close()
# python爬取阳光电影保存mysql — scrape Yangguang Movies (ygdy8.com) and save to MySQL
# (Blog footer: "latest recommended article published 2023-05-25 17:57:58")