爬取菜鸟教程的100例

import requests
from lxml import etree

# URL template for the runoob Python exercise pages; the %s placeholder is
# filled with the exercise number when a page is requested.
base_url = 'https://www.runoob.com/python/python-exercise-example%s.html'


def get_element(url, timeout=10):
    """Download ``url`` and return it parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; handed
            straight to ``requests.get``. Without it a stalled connection
            would block the caller indefinitely.

    Returns:
        The root element produced by ``etree.HTML`` from the response text.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # Fixed browser-like headers sent with every request.
    headers = {
        'cookie': '__gads=Test; Hm_lvt_3eec0b7da6548cf07db3bc477ea905ee=1573454862,1573470948,1573478656,1573713819; Hm_lpvt_3eec0b7da6548cf07db3bc477ea905ee=1573714018; SERVERID=fb669a01438a4693a180d7ad8d474adb|1573713997|1573713863',
        'referer': 'https://www.runoob.com/python/python-100-examples.html',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing the error page as if
    # it were the requested document.
    response.raise_for_status()
    return etree.HTML(response.text)


def write_py(i, text):
    """Save ``text`` as a UTF-8 file named ``练习实例<i>.py``.

    The file is created in the current working directory; an existing file
    with the same name is overwritten.

    Args:
        i: Value inserted into the file name.
        text: Content written to the file.
    """
    target_name = '练习实例%s.py' % i
    with open(target_name, mode='w', encoding='utf-8') as out:
        out.write(text)


def main():
    """Fetch exercise pages 1-100 and assemble the text found on each.

    For every page the URL, the parsed root element and the extracted
    exercise statement are printed. A page on which the statement cannot be
    located is reported and skipped, so one odd page no longer aborts the
    remaining downloads with an ``IndexError``.

    The assembled text is not written to disk: the ``write_py`` call is left
    disabled, as it was before.
    """
    for i in range(1, 101):
        page_url = base_url % i
        html = get_element(page_url)
        print(page_url)
        print(html)

        # The statement is the first text node of the second <p> inside the
        # element with id="content".
        statement = html.xpath('//div[@id="content"]/p[2]/text()')
        if not statement:
            print('skip: no exercise text found on %s' % page_url)
            continue
        content = '题目:' + statement[0] + '\n'
        print(content)

        # NOTE(review): p[2] is the first <p> with position() >= 2, so this
        # picks the very same text node as `statement` and the statement ends
        # up duplicated. Possibly p[3] was intended -- confirm against the page.
        fenxi = html.xpath('//div[@id="content"]/p[position()>=2]/text()')[0]
        # Concatenate the highlighted code spans back into source text.
        daima = ''.join(html.xpath('//div[@class="hl-main"]/span/text()')) + '\n'
        haha = '"""\n' + content + fenxi + daima + '\n"""'
        # Saving is currently disabled; enable the next line to write one
        # file per exercise.
        # write_py(i, haha)


if __name__ == '__main__':
    main()


打开文件时,模式 a 表示在文件末尾追加写入,而模式 w 会覆盖文件原有的内容

import requests
from lxml import html

# Download one novel chapter and append its paragraphs to 小说.txt.
url = "http://book.zongheng.com/chapter/952586/60520336.html"

header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
}
# Close the session once the page is fetched, and bound the wait so a stalled
# server cannot hang the script.
with requests.Session() as session:
    page = session.get(url, headers=header, timeout=10)
tree = html.fromstring(page.text)
# XPath: a leading '/' selects from the document root, '//' matches at any depth.
result = tree.xpath('//*[@id="readerFt"]/div/div[5]/p/text()')
# Open the output once in append mode ('a' keeps existing content) and close
# it reliably; previously a new handle was opened per paragraph and never
# closed. UTF-8 is set explicitly so the Chinese text does not depend on the
# platform's default encoding.
with open('小说.txt', 'a', encoding='utf-8') as f:
    for i in result:
        print(i)
        f.write(i + '\n')

https://blog.csdn.net/qq_40558166/article/details/102868801?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522162435676916780261949018%2522%252C%2522scm%2522%253A%252220140713.130102334%E2%80%A6%2522%257D&request_id=162435676916780261949018&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2alltop_positive~default-2-102868801.first_rank_v2_pc_rank_v29&utm_term=%E7%88%AC%E8%99%AB&spm=1018.2226.3001.4187

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值