前言
提示:以下是本篇文章正文内容,下面案例可供参考
一、抓取电影天堂电影下载链接(re的简单运用)
import requests
import re
import csv
# Landing page of the site; the child links found on it are site-relative.
DYTT_DOMAIN = "https://www.dytt89.com/"
# Seconds to wait for each HTTP response; without a timeout a stalled
# connection would block the script indefinitely.
DYTT_TIMEOUT = 10

# Captures the first <ul>...</ul> that follows the "2021必看热片" heading.
DYTT_HOT_LIST_RE = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
# Captures the target of every single-quoted <a href='...'> inside that list.
DYTT_CHILD_LINK_RE = re.compile(r"<a href='(?P<href>.*?)'", re.S)
# Captures the movie title and the first download link of a detail page.
# The label is matched with \s* between its characters so the pattern does not
# depend on the exact spacing (ASCII or full-width) used inside "◎片 名".
DYTT_MOVIE_RE = re.compile(
    r'◎片\s*名\s*(?P<movie>.*?)<br />.*?<td'
    r' style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download>.*?)">',
    re.S,
)


def extract_child_links(page_html, domain=DYTT_DOMAIN):
    """Return absolute detail-page URLs listed under the hot-list heading.

    Args:
        page_html: Decoded HTML text of the landing page.
        domain: Prefix (ending in "/") prepended to every relative href.

    Returns:
        A list of URLs in page order; empty when the heading is not found.
    """
    return [
        # Slashes are stripped from the href because ``domain`` already ends
        # with one.
        domain + link.group("href").strip("/")
        for section in DYTT_HOT_LIST_RE.finditer(page_html)
        for link in DYTT_CHILD_LINK_RE.finditer(section.group("ul"))
    ]


def extract_movie_rows(page_html):
    """Return ``(movie, download)`` tuples found in a detail page's HTML."""
    return [
        (match.group("movie"), match.group("download"))
        for match in DYTT_MOVIE_RE.finditer(page_html)
    ]


def crawl_dytt(csv_path="moviedata.csv"):
    """Scrape the hot list and append one CSV row per movie to *csv_path*.

    Each row is ``movie, download``. A progress line is printed after every
    row that is written.
    """
    # NOTE(review): verify=False turns off TLS certificate verification and is
    # kept from the original script; confirm it is still required.
    resp = requests.get(DYTT_DOMAIN, verify=False, timeout=DYTT_TIMEOUT)
    resp.encoding = 'gb2312'
    child_links = extract_child_links(resp.text)

    written = 0
    # newline="" lets the csv module control line endings (otherwise blank
    # rows appear on Windows); ``with`` closes the file even if a request or
    # a write raises part-way through.
    with open(csv_path, "a", newline="") as csv_file:
        writer = csv.writer(csv_file)
        for href in child_links:
            child_resp = requests.get(href, verify=False, timeout=DYTT_TIMEOUT)
            child_resp.encoding = 'gbk'
            for row in extract_movie_rows(child_resp.text):
                writer.writerow(row)
                written += 1
                print('成功爬取%d个' % written)


if __name__ == "__main__":
    crawl_dytt()
二、抓取豆瓣网 Top 250 电影的名称、年份、评分和评价人数(re的简单运用)
import requests
import re
import csv
# Request headers; a browser User-Agent is sent with every page request.
DOUBAN_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31'
}
# Seconds to wait for each HTTP response before giving up.
DOUBAN_TIMEOUT = 10
# ``start`` offsets of the pages: 0, 25, ..., 225. The list has 250 entries at
# 25 per page, so offset 250 (which the old range(0, 251, 25) also requested)
# is past the end and only cost an extra request.
DOUBAN_PAGE_STARTS = range(0, 250, 25)

# One match per list entry: title, year, rating, and number of raters.
# Compiled once here rather than on every page.
# NOTE(review): the year is terminated by a non-breaking space, matched either
# as the raw entity "&nbsp;" or as an already-decoded U+00A0. The earlier
# pattern used plain spaces there, which stops the lazy year group at the
# indentation right after <br>; confirm against the live markup.
DOUBAN_ITEM_RE = re.compile(
    r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
    r'</span>.*?<p class="">.*?<br>(?P<year>.*?)(?:&nbsp;|\xa0).*?'
    r'<span class="rating_num" property="v:average">(?P<star>.*?)</span>.*?<span>(?P<score>.*?)人评价</span>',
    re.S,
)


def parse_douban_page(page_content):
    """Return ``(name, year, star, score)`` tuples parsed from one page.

    ``star`` is the average rating and ``score`` the number of raters (the
    group names are kept from the original pattern). ``year``, ``star`` and
    ``score`` are stripped of surrounding whitespace; ``name`` is returned as
    captured.
    """
    return [
        (
            match.group("name"),
            match.group("year").strip(),
            match.group("star").strip(),
            match.group("score").strip(),
        )
        for match in DOUBAN_ITEM_RE.finditer(page_content)
    ]


def crawl_douban_top250(csv_path="data.csv"):
    """Fetch every page of the list, print each field, append rows to CSV."""
    # The file is opened once for the whole run. newline="" lets the csv
    # module control line endings, and ``with`` closes the file even when a
    # request fails midway.
    with open(csv_path, mode="a", newline="") as csv_file:
        writer = csv.writer(csv_file)
        for start in DOUBAN_PAGE_STARTS:
            url = "https://movie.douban.com/top250?start=" + str(start) + "&filter="
            resp = requests.get(url, headers=DOUBAN_HEADERS, timeout=DOUBAN_TIMEOUT)
            for row in parse_douban_page(resp.text):
                for field in row:
                    print(field)
                writer.writerow(row)
    print("over")


if __name__ == "__main__":
    crawl_douban_top250()
三、抓取猪八戒网saas项目接单信息(lxml etree的简单运用)
import requests
from lxml import etree
import csv
# Search-results page for the keyword "saas" on the Wuhan sub-site.
ZBJ_URL = "https://wuhan.zbj.com/search/f/?kw=saas"
# Seconds to wait for the HTTP response before giving up.
ZBJ_TIMEOUT = 10
# Absolute XPath selecting the result cards on the page.
ZBJ_CARDS_XPATH = "/html/body/div[6]/div/div/div[2]/div[5]/div[1]/div"


def format_row(title, com_name, price, location):
    """Return the four fields as a double-quoted tuple followed by a comma.

    Example: ``format_row("t", "c", "1", "w")`` gives ``("t","c","1","w"),``.
    Values are inserted verbatim; embedded quotes are not escaped.
    """
    return '("' + '","'.join((title, com_name, price, location)) + '"),'


def parse_card(div):
    """Extract ``(title, com_name, price, location)`` from one result card.

    Args:
        div: An lxml element for a single card.

    Returns:
        The four strings, or ``None`` when any expected text node is missing,
        so one card with a different layout does not abort the whole run.
    """
    try:
        price = div.xpath('./div/div/a/div[2]/div[1]/span[1]/text()')[0].strip('¥')
        # The company name is taken from the second text node of the <p>.
        com_name = div.xpath("./div/div/a/div[1]/p/text()")[1].strip('\n')
        location = div.xpath("./div/div/a/div[1]/div/span/text()")[0]
    except IndexError:
        return None
    # The title's text nodes are re-joined with the search keyword.
    # NOTE(review): presumably the keyword sits in its own highlight element
    # and is therefore absent from text(); confirm against the page markup.
    title = "saas".join(div.xpath('./div/div/a/div[2]/div[2]/p/text()'))
    return title, com_name, price, location


def crawl_zbj(csv_path="zbj.csv"):
    """Print every result card and append it to *csv_path* as a CSV row.

    Rows are written as ``title, com_name, price, location``, the same field
    order as the printed line.
    """
    resp = requests.get(ZBJ_URL, timeout=ZBJ_TIMEOUT)
    html = etree.HTML(resp.text)
    cards = html.xpath(ZBJ_CARDS_XPATH)
    # The file used to be opened and then never written or closed; it is now
    # written with csv.writer and closed by ``with``. newline="" lets the csv
    # module control line endings.
    with open(csv_path, "a", newline="") as csv_file:
        writer = csv.writer(csv_file)
        for card in cards:
            row = parse_card(card)
            if row is None:
                continue
            print(format_row(*row))
            writer.writerow(row)


if __name__ == "__main__":
    crawl_zbj()