这个代码主要是为了锻炼对requests库和re库方法的熟练度。主要提取了书的书名、作者、出版社、价格等信息。
# -*- coding: utf-8 -*-
import requests
import re
from requests.exceptions import RequestException
def get_one_page(url, timeout=10):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with status 200;
        ``None`` for any other status code or when the request raises a
        ``RequestException`` (which includes timeouts).
    """
    try:
        r = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if r.status_code != 200:
        return None
    r.encoding = "utf-8"  # force UTF-8 so the text is not garbled
    return r.text
def parse_html(html):
    """Print the fields extracted from each ``gl-item`` entry in *html*.

    The number of matched items is printed first. For every item the
    function then prints, one per line: the price, the name (three
    captured pieces joined together), the two author captures, the store
    and the two captures taken from the ``p-commit`` block, followed by a
    blank line. The image URL and the ``title`` attribute are captured by
    the pattern but not printed.

    Args:
        html: Page markup to scan. ``None`` (what ``get_one_page`` returns
            on failure) or an empty string is treated as a page with no
            items, so only ``0`` is printed.

    Returns:
        None. All results go to standard output.
    """
    # NOTE(review): the ``|`` inside ``[\u4E00-\u9FA5|a-zA-Z\s]`` is a literal
    # pipe, not alternation -- confirm whether that was intended.
    pattern = re.compile(r'<li.*?class="gl-item">.*?<a.*?<img.*?source-data-lazy-img="(.*?)" />.*?</a>'
                         r'.*?<div class="p-price">.*?/em><i>(.*?)</i>.*?<div class="p-name">.*?<a.*?title="(.*?)"'
                         r'.*?<em>([\u4E00-\u9FA5|a-zA-Z\s]*?)<font.*?>(.*?)</font>(.*?)</em>'
                         r'.*?<div class="p-bookdetails"><span class="p-bi-name".*?>(.*?)<a.*?>(.*?)</a>.*?</span>'
                         r'.*?<span class="p-bi-store".*?><a.*?>(.*?)</a>.*?</span>'
                         r'.*?<div class="p-commit">.*?<a.*?>(.*?)</a>(.*?)</strong>', re.S)
    # A failed download arrives as None; findall() would raise TypeError on
    # it, so fall back to an empty document instead of crashing.
    items = pattern.findall(html or "")
    print(len(items))
    for item in items:
        print(item[1])  # price
        print(item[3] + item[4] + item[5])  # name
        print(item[6], item[7])  # author
        print(item[8])  # store
        print(item[9], item[10])  # captures from the p-commit block
        print()
def main(keyword):
    """Search JD for *keyword* and print what is parsed from the result page.

    Args:
        keyword: Raw search text as typed by the user. It is percent-encoded
            before being placed in the query string, so characters such as
            ``&``, ``#``, ``+`` or ``=`` cannot corrupt the URL.
    """
    # Imported locally so this function carries its own dependency.
    from urllib.parse import quote

    # safe="" also escapes "/" -- the keyword is a query value, not a path.
    url = "https://search.jd.com/Search?keyword=" + quote(keyword, safe="")
    html = get_one_page(url)
    parse_html(html)
if __name__ == '__main__':
    # Script entry point: prompt for a search keyword on stdin and run a
    # single search with it.
    main(input("请输入:"))
结果: