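# This script exists only to practise writing regular expressions.
# (Taobao turned out to be too hard to scrape, so it is not used here.)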
import urllib.request
import urllib.parse
import json
import jsonpath
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')
# Comment-list endpoint on product.dangdang.com for a single product, split
# around the page number: a full URL is url_front + <page index> + url_back.
url_front = (
    'http://product.dangdang.com/index.php'
    '?r=comment%2Flist'
    '&productId=1204926048'
    '&categoryPath=01.41.26.21.00.00'
    '&mainProductId=1204926048'
    '&mediumId=0'
    '&pageIndex='
)
url_back = (
    '&sortType=1'
    '&filterType=1'
    '&isSystem=1'
    '&tagId=0'
    '&tagFilterCount=0'
    '&template=publish'
    '&long_or_short=short'
)

# HTTP headers attached to every outgoing request (a desktop Firefox
# User-Agent string).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:72.0) '
        'Gecko/20100101 Firefox/72.0'
    ),
}
def handle_request(url):
    """Build (but do not send) a request object for *url*.

    The module-level ``headers`` dict is attached to the request.

    Args:
        url: Absolute URL to request.

    Returns:
        A ``urllib.request.Request`` ready to be passed to ``urlopen``.
    """
    return urllib.request.Request(url, headers=headers)
def get_response(request):
    """Open *request* and hand back the response object.

    Args:
        request: Anything ``urllib.request.urlopen`` accepts (a
            ``urllib.request.Request`` or a URL string).

    Returns:
        The still-open response returned by ``urlopen``; the caller is
        responsible for reading it.
    """
    return urllib.request.urlopen(request)
def parse_json(json_text):
    """Extract the HTML fragment stored at ``data.list.html`` in a JSON reply.

    The lookup is a fixed three-key path, so it is walked with plain dict
    access. A missing key therefore produces a clear error instead of the
    ``TypeError`` that indexing a failed ``jsonpath`` lookup (``False``)
    would raise.

    Args:
        json_text: JSON document as a string.

    Returns:
        The value found at ``data.list.html``.

    Raises:
        ValueError: If the text is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or the ``data.list.html`` path is
            absent from the decoded document.
    """
    node = json.loads(json_text)
    walked = '$'
    for key in ('data', 'list', 'html'):
        # Every step of the path must be an object that contains the key.
        if not isinstance(node, dict) or key not in node:
            raise ValueError(
                'unexpected response: no %r under %s' % (key, walked)
            )
        node = node[key]
        walked = walked + '.' + key
    return node
# Compiled once at import time. Only re.S is set (NOT re.VERBOSE), so the line
# breaks inside the pattern are literal newlines that must be present in the
# markup being searched. Each match yields a tuple of seven captured strings.
_COMMENT_PATTERN = re.compile(r'''<div class="comment_items clearfix">
.*?
<em>(.*?)</em>
.*?
<span><a href="(.*?)" target="_blank">(.*?)</a></span>
.*?
<span>(.*?)</span>
.*?
<div class="support" data-comment-id="(.*?)">
.*?
<a class="pic" href="javascript:"><img alt="(.*?)" src="(.*?)"/></a>
.*?''', re.S)


def _fetch_comment_html(page):
    """Download comment page *page* and return its embedded HTML fragment."""
    url = url_front + str(page) + url_back
    # Close the HTTP response deterministically instead of leaking it.
    with get_response(handle_request(url=url)) as response:
        json_text = response.read().decode('gbk')
    return parse_json(json_text)


def _extract_comments(html):
    """Return the regex capture tuples for every comment block in *html*."""
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    comment_nodes = soup.select('.item_wrap > div')
    # str() of the result list renders as "[<div>..</div>, <div>..</div>]";
    # strip the surrounding brackets so only the markup remains.
    markup = str(comment_nodes)[1:-1]
    return _COMMENT_PATTERN.findall(markup)


def main():
    """Prompt for a page range, then print the comments found on each page.

    Reads the first and last page numbers from stdin and fetches every page
    from the first to the last, inclusive. Each regex match is printed as a
    tuple, one per line. If the last page is smaller than the first, only
    the first page is fetched.

    Raises:
        ValueError: If either prompt is answered with a non-integer.
    """
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    # The end page used to be read and then ignored; iterate the whole range.
    last_page = max(start_page, end_page)
    for page in range(start_page, last_page + 1):
        for comment in _extract_comments(_fetch_comment_html(page)):
            print(comment)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()