Uimaker是为UI设计师提供学UI设计的专业UI平台,拥有UI教程、UI素材、ICON、图标设计UI、手机UI、ui设计师招聘、软件界面设计、后台界面、后台模版等相关内容,快来uimaker学UI设计。
目标网站:
uimaker.com/uimakerdown/list_36_1.html
爬取思路:
第一步:获取素材页码总页数
第二步:爬取素材列表链接
第三步:爬取素材详情
python爬取采集关键点:
1.页码总数格式化处理
replace函数和split函数
pagenum=pagenum_url.replace(".html",'').split('_')[-1]
2.内容详情格式化处理
article=req.xpath('//div[@class="contentinfo"]/table//text()') article =''.join(article) article =article.strip()
3.是否需要u币下载,这里作了判断
b_num=req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')[0] if int(b_num)==0:
4.特殊情况,无此链接信息的处理
try: down_url=req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0] down_name=f'{h2}/{h2}.rar' print(down_url,down_name) self.down(down_url,down_name) except Exception as e: print("无此链接信息!")
运行情况:
运行效果:
完整源码:
# uimaker 素材获取 — scraper for the uimaker.com download section
# 2020-03-10, by WeChat: huguo00289
# -*- coding: UTF-8 -*-
import requests, time, os
from lxml import etree
from fake_useragent import UserAgent


class Uimaker():
    """Scrape uimaker.com/uimakerdown: discover the page count, walk every
    list page, and for each item save its text, images and — when the item
    costs 0 U-coins — the downloadable archive."""

    def __init__(self):
        # One random User-Agent is picked at startup and reused for all requests.
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def get_pagenum(self):
        """Return the total number of list pages as an int."""
        url = "http://www.uimaker.com/uimakerdown/"
        # The site serves GBK; errors="ignore" keeps one stray byte from
        # aborting the whole crawl (original raised UnicodeDecodeError).
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk", errors="ignore")
        req = etree.HTML(response)
        # The last pager entry links to ".../list_36_<N>.html"; N is the page count.
        pagenum_url = req.xpath('//div[@class="page"]/ul[@class="pagelist"]/li')[-1].xpath('.//a/@href')[0]
        pagenum = int(pagenum_url.replace(".html", '').split('_')[-1])
        print(pagenum)
        return pagenum

    def get_urllist(self, i):
        """Return the detail-page URLs found on list page *i* (1-based)."""
        url = f"http://www.uimaker.com/uimakerdown/list_36_{i}.html"
        print(url)
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk", errors="ignore")
        req = etree.HTML(response)
        urllist = req.xpath('//dl[@class="imglist"]/dt/ul[@class="listimg"]/li/span[@class="listpic"]/a/@href')
        print(len(urllist))
        print(urllist)
        return urllist

    def get_dowm(self, url):
        """Scrape one detail page: save its text, its images, and — if the
        item is free (0 U-coins) — its download archive."""
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk", errors="ignore")
        req = etree.HTML(response)
        h2 = req.xpath('//div[@class="arcinfo"]/h2/text()')[0]
        print(h2)
        # NOTE(review): h2 is the raw page title; a title containing a path
        # separator would break makedirs — confirm titles are always safe.
        os.makedirs(f'{h2}/', exist_ok=True)
        article = ''.join(req.xpath('//div[@class="contentinfo"]/table//text()')).strip()
        print(article)
        texts = f'{h2}\n{article}'
        self.get_text(h2, texts)
        imgs = req.xpath('//div[@class="contentinfo"]/table//@src')
        # enumerate replaces the original hand-rolled counter (i = i + 1).
        for i, img in enumerate(imgs, start=1):
            img_url = f'http://www.uimaker.com{img}'
            suffix = os.path.splitext(img)[1]
            img_name = f'{i}{suffix}'
            print(img_url, img_name)
            self.get_downimg(h2, img_url, img_name)
        # The first <b> inside the download box holds the U-coin price;
        # only items priced 0 can be fetched without an account.
        b_num = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')[0]
        if int(b_num) == 0:
            # Best-effort: a missing link (IndexError from xpath()[0]) or a
            # failed download is reported, not fatal — as in the original.
            try:
                down_url = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0]
                down_name = f'{h2}/{h2}.rar'
                print(down_url, down_name)
                self.down(down_url, down_name)
            except Exception:
                print("无此链接信息!")
        print('>>>获取素材成功!')

    def get_text(self, h2, texts):
        """Write *texts* to <h2>/<h2>.txt as UTF-8."""
        print("开始保存文本内容...")
        with open(f'{h2}/{h2}.txt', 'w', encoding="utf-8") as f:
            f.write(texts)
        print(">>>保存文本内容完成!")

    def get_downimg(self, h2, img_url, img_name):
        """Download one image into the *h2* directory."""
        print("开始下载图片...")
        r = requests.get(img_url, headers=self.headers, timeout=6)
        with open(f'{h2}/{img_name}', 'wb') as f:
            f.write(r.content)
        print(">>>下载素材完成!")

    def down(self, down_url, down_name):
        """Download the material archive from *down_url* to *down_name*."""
        print("开始下载素材...")
        r = requests.get(down_url, headers=self.headers, timeout=6)
        with open(down_name, 'wb') as f:
            f.write(r.content)
        print(">>>下载素材完成!")

    def run(self):
        """Walk every list page and scrape every item it links to."""
        pagenum = self.get_pagenum()
        for i in range(1, pagenum + 1):
            for url in self.get_urllist(i):
                self.get_dowm(url)


if __name__ == '__main__':
    spider = Uimaker()
    spider.run()