电影天堂的种子爬取(数据获取不全面,存在 bug,望各位指点)

import requests
from lxml import etree
import pymysql
from urllib import parse
class MysqlHelper(object):
    """Thin wrapper around one PyMySQL connection and its cursor.

    The connection is opened in ``__init__`` and released by ``close()``
    (also invoked from ``__del__`` as a last resort).
    """

    def __init__(self, host='127.0.0.1', port=3306, user='root',
                 password='123456', database='py11', charset='utf8'):
        """Open the MySQL connection and create a cursor.

        All arguments are forwarded to ``pymysql.connect``. The defaults
        reproduce the settings that used to be hard-coded here, so
        ``MysqlHelper()`` keeps connecting to the same database.

        Raises:
            Whatever ``pymysql.connect`` raises when the connection fails.
        """
        # Define both attributes before connecting so that close() and
        # __del__ stay safe even if pymysql.connect() raises.
        self.db = None
        self.cursor = None
        self.db = pymysql.connect(host=host, port=port, user=user,
                                  password=password, database=database,
                                  charset=charset)
        self.cursor = self.db.cursor()

    def execute_modify_sql(self, sql, data=None):
        """Run one data-modifying statement and commit it.

        Args:
            sql: Statement text, using ``%s`` placeholders for parameters.
            data: Parameters for the placeholders (tuple/list/dict), or
                ``None`` when the statement has no placeholders.

        Raises:
            Re-raises any error from ``execute``/``commit`` after attempting
            to roll the transaction back.
        """
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except Exception:
            # Undo the partial transaction, but never let a failing rollback
            # hide the error that actually caused the problem.
            try:
                self.db.rollback()
            except Exception:
                pass
            raise

    def close(self):
        """Close the cursor and the connection; safe to call repeatedly."""
        cursor = getattr(self, 'cursor', None)
        db = getattr(self, 'db', None)
        # Drop the references first so a second call is a no-op.
        self.cursor = None
        self.db = None
        if cursor is not None:
            cursor.close()
        if db is not None:
            db.close()

    def __del__(self):
        # Finalizers may run on a half-initialised object or during
        # interpreter shutdown; cleanup here is best-effort only.
        try:
            self.close()
        except Exception:
            pass

# Parameterized INSERT used for every scraped row; the two %s placeholders
# are filled from the data tuple passed to execute_modify_sql.
sql = 'insert into dy(url_list,tr_list) values (%s,%s)'
# Module-level helper instance; constructing it opens the MySQL connection.
mc = MysqlHelper()

# Browser-like request headers sent with every request of the crawl.
#
# The conditional-request validators that were copied from a browser session
# (If-Modified-Since / If-None-Match) are deliberately NOT included: with
# them the server may answer "304 Not Modified" with an empty body, leaving
# nothing to parse.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'UM_distinctid=1654b8ae4b08db-0313750a9b0b41-9393265-1fa400-1654b8ae4b2679; XLA_CI=73872247efe6cc02697c8a3cacbe04fa; cscpvcouplet4298_fidx=4; CNZZDATA1260535040=665870404-1534569071-%7C1534671676; cscpvrich5041_fidx=3',
    'Host': 'www.dytt8.net',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}
def _fetch_tree(page_url, encoding=None):
    """Download *page_url* and return its parsed lxml tree, or None.

    Args:
        page_url: Absolute URL to request with the module-level ``headers``.
        encoding: When given, the raw body is decoded with this codec
            (undecodable bytes are replaced); otherwise the text decoded by
            ``requests`` is used.

    Returns:
        The element returned by ``etree.HTML``, or ``None`` when the request
        failed, the reply was not a 200 with a body, or nothing was parsed.
    """
    try:
        # A timeout keeps one stalled connection from hanging the crawl.
        reply = requests.get(page_url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('skip {}: {}'.format(page_url, exc))
        return None
    # Non-200 replies (e.g. 304 Not Modified) carry no page to parse.
    if reply.status_code != 200 or not reply.content:
        print('skip {}: HTTP {}'.format(page_url, reply.status_code))
        return None
    if encoding is None:
        markup = reply.text
    else:
        # errors='replace' so a single bad byte cannot abort the whole run.
        markup = reply.content.decode(encoding, errors='replace')
    return etree.HTML(markup)


# Listing pages: pages 1-3 of the index.
for i in range(1, 4):
    url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'.format(i)

    html_ele = _fetch_tree(url)
    if html_ele is None:
        continue

    table_list = html_ele.xpath('//div[@class = "co_content8"]/ul/td/table')

    # Detail pages: one per table found on the listing page.
    for table_ele in table_list:
        hrefs = table_ele.xpath('./tr[2]/td[2]/b/a/@href')
        if not hrefs:
            # Table without a detail link: nothing to follow.
            continue
        detail_url = parse.urljoin(url, hrefs[0])

        html_ele_a = _fetch_tree(detail_url, encoding='gbk')
        if html_ele_a is None:
            continue

        title_parts = html_ele_a.xpath('//div[@class="bd3r"]/div[2]/div[1]/h1/font/text()')
        if not title_parts:
            continue

        # xpath() returns a list of text nodes; store them as one string so
        # the second %s placeholder receives a scalar value rather than a
        # list.
        data = (detail_url, ''.join(title_parts))
        mc.execute_modify_sql(sql, data)

转载于:https://www.cnblogs.com/luwanhe/p/9502896.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值