The Good Design Award, established in 1957, is Japan's only comprehensive design evaluation and recommendation system, commonly known as the G-mark.
This post scrapes the award data from the Japanese design site; the data there is very complete, though access from inside China sometimes fails!
Scraping requirements:
fetch the name, description, and images of the award-winning works for each year.
I used Python classes to build a simple spider system,
consisting of
a spider module: gmspider.py
a storage module: store.py
a download module: down.py
and a run script: main.py
Wiring the data between modules and handling exceptions took quite a bit of time and effort; after trying it yourself, you really come to appreciate how powerful the Scrapy framework is!
A few key points:
1. Storing the scraped data
First: write it to a local txt file;
second: write it to a local MySQL database.
2. Passing data between the modules
Here I again used a queue (queue.Queue) to hand records from one module to the next;
Scrapy's item pipelines would make this trivial, but without the framework you have to wire it up by hand, as sketched below.
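A minimal sketch of that handoff (the function names and the record tuple here are illustrative placeholders, not the project's actual code):

from queue import Queue

# The spider fills the queue; the run script drains it.
datas = Queue(1000)

def spider():
    for page in ['page1', 'page2']:                      # stand-in for the real page list
        record = ('category', 'title', [], '2018', 'detail')
        datas.put(record)                                # producer side: the spider module

def runner():
    spider()
    while not datas.empty():                             # consumer side: the run script
        data = datas.get()
        print(data)                                      # hand off to Save / Gdown here

runner()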
3. Extracting and downloading the data
I considered two schemes:
Scheme one: insert each record into the database while scraping, and download its images at the same time.
Scheme two: insert the records into the database while scraping, then query the database afterwards and download the images from the stored records; to some extent this sidesteps periods of poor network conditions.
4. Retrying network timeouts during download, and exception handling
Handling network timeouts:
from requests.adapters import HTTPAdapter  # bring in HTTPAdapter

s = requests.Session()
s.mount('http://', HTTPAdapter(max_retries=3))   # retry 3 times
s.mount('https://', HTTPAdapter(max_retries=3))  # retry 3 times
try:
    r = s.get(img_url, timeout=20, headers=self.headers)
    with open(f'{path}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">>> Downloaded {img_name} successfully!")
except requests.exceptions.RequestException as e:
    # log the failure (as in down.py below)
    print(f">>> Failed to download {img_name}, error: {e}!")
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    with open(f'{path}/img_spider.txt', 'a+', encoding='utf-8') as f:
        f.write(f'{img_url},{img_name},{path}-download failed, error: {e}-{now}\n')
Three retries on top of the initial request make four attempts in total!
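If you want finer control than a bare integer count, urllib3's Retry object can be mounted instead; this is a sketch of an alternative, not what the project uses:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Sketch: retry with backoff instead of a plain integer retry count.
retries = Retry(total=3,                               # still 3 retries + the first attempt
                backoff_factor=1,                      # exponential backoff between attempts
                status_forcelist=[500, 502, 503, 504]) # also retry on these HTTP statuses
s = requests.Session()
s.mount('https://', HTTPAdapter(max_retries=retries))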
Run output:
Exception handling
try/except, plus writing the exception details to a log, here a local txt file!
The code still feels verbose, and it is hard to find a clean way to handle every failure case!
5. Tips for parsing pages with etree
When extracting node data, printing the node's HTML source is a handy way to see what you are working with.
How to convert an element located via xpath back to HTML source:
# Method 1: use the tostring function from lxml.html
from lxml.html import tostring
from lxml import etree

html_get = etree.HTML(resp_text)
div_ok = html_get.xpath('//div[@id="mw-content-text"]')[0]
div_content = tostring(div_ok).decode('utf-8')

# Method 2 (recommended; in my timing tests, converting an element located
# via xpath back to HTML source with etree.tostring is faster):
from lxml import etree

html_get = etree.HTML(resp_text)
div_ok = html_get.xpath('//div[@id="mw-content-text"]')[0]
print(div_ok, type(div_ok))
div_content = etree.tostring(div_ok, pretty_print=True, method='html').decode('utf-8')  # convert to a string

# Source: CSDN blogger 奋斗吧-皮卡丘
Getting data out with xpath takes several attempts: narrow the expression down step by step and print the output to check each stage, as in the sketch below.
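A small sketch of that workflow, using an inline HTML string as a stand-in for the real response:

from lxml import etree

# Narrow an xpath step by step, printing each stage to verify it.
resp_text = '<section class="prizeArea section"><ul><li><a href="/x">A</a></li></ul></section>'
html = etree.HTML(resp_text)

sections = html.xpath('//section[@class="prizeArea section"]')
print(len(sections))                                 # step 1: did the section match?
print(etree.tostring(sections[0]).decode('utf-8'))   # step 2: inspect its HTML source
print(sections[0].xpath('.//li/a[1]/@href'))         # step 3: drill down to the links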
In the end it comes down to personal fluency, and I am clearly not fluent yet!
Run output:
The project:
Scraping results:
The source code is attached below.
Spider module gmspider.py
import requests
from fake_useragent import UserAgent
from lxml import etree
import re, time
from queue import Queue
from requests.adapters import HTTPAdapter  # for mounting retry adapters


class Gspider(object):
    def __init__(self, year):
        self.ua = UserAgent()
        self.headers = {'User-Agent': self.ua.random}
        self.url = "https://www.g-mark.org"
        self.year = str(year)
        self.datas = Queue(1000)

    # fetch the award list page for the year
    def parse_list(self):
        url = "%s/award/search?" % self.url
        params = {
            'from': self.year,
            'to': '',
            'prizeCode': '',
            'keyword': '',
            'locale': 'en',
        }
        response = requests.get(url, params=params, headers=self.headers, timeout=10).content.decode('utf-8')
        time.sleep(1)
        html = etree.HTML(response)
        sections = html.xpath('//section[@class="prizeArea section"]')
        print(len(sections))
        for section in sections:
            category = section.xpath('*//img[1]/@alt')[0]
            category = re.sub(r'[\|\/\<\>\:\*\?\\\"]', '_', category)  # sanitize the prize category for use as a directory name
            category = str(category)
            print(category)
            # print(etree.tostring(section).decode('utf-8'))
            page_urls = section.xpath('*//li/a[1]/@href')
            print(category, page_urls)
            self.parse_pages(category, page_urls)
        datas_num = self.datas.qsize()
        print(f'>>> Finished fetching the {self.year} award data!')
        print(f'>>> {datas_num} records in total!')
        return self.datas

    # walk the detail pages of one prize category
    def parse_pages(self, category, page_list):
        for page_url in page_list:
            page_url = '%s%s%s' % (self.url, page_url, '&locale=en')
            print(page_url)
            try:
                data = self.parse_page(category, page_url)
                self.datas.put(data)
            except Exception as e:
                print(f'Failed to fetch data, error: {e}')
                with open('page_spider.txt', 'a+', encoding='utf-8') as f:
                    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    f.write(f'{category}-{page_url}-{self.year}-fetch failed, error: {e}-{now}\n')

    # parse one detail page
    def parse_page(self, category, page_url):
        try:
            response = requests.get(page_url, headers=self.headers, timeout=10).content.decode('utf-8')
            html = etree.HTML(response)
            title = html.xpath('//dl[@class="basicinfo"]/dd[@class="item"]/text()')[0]
            title = re.sub(r'[\|\/\<\>\:\*\?\'\\."]', '_', title)  # sanitize the title
            print(title)
            detail = html.xpath('//dl[@class="detail"]/dd[1]/p/text()')[0]  # the description
            imgs = html.xpath('//figure[@id="mainphoto"]/ul[@class="photo"]/li/a/img/@src')
            data = (category, title, imgs, self.year, detail)
            print(data)
        except Exception as e:
            if "Read timed out" in str(e):
                print(f'>>> Fetching {page_url} timed out, retrying...')
                time.sleep(2)
                s = requests.Session()
                s.mount('http://', HTTPAdapter(max_retries=3))   # retry 3 times
                s.mount('https://', HTTPAdapter(max_retries=3))  # retry 3 times
                try:
                    response = s.get(page_url, timeout=20, headers=self.headers).content.decode('utf-8')
                    html = etree.HTML(response)
                    title = html.xpath('//dl[@class="basicinfo"]/dd[@class="item"]/text()')[0]
                    title = re.sub(r'[\|\/\<\>\:\*\?\'\\."]', '_', title)  # sanitize the title
                    print(title)
                    detail = html.xpath('//dl[@class="detail"]/dd[1]/p/text()')[0]  # the description
                    imgs = html.xpath('//figure[@id="mainphoto"]/ul[@class="photo"]/li/a/img/@src')
                    data = (category, title, imgs, self.year, detail)
                    print(data)
                except requests.exceptions.RequestException as e:
                    print(f'Failed to fetch data, error: {e}')
                    with open('pageurl_spider.txt', 'a+', encoding='utf-8') as f:
                        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        f.write(f'{category}-{page_url}-{self.year}-failed to fetch {page_url}, error: {e}-{now}\n')
            else:
                print(f'Failed to fetch data, error: {e}')
                with open('pageurl_spider.txt', 'a+', encoding='utf-8') as f:
                    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    f.write(f'{category}-{page_url}-{self.year}-failed to fetch {page_url}, error: {e}-{now}\n')
        # if both the first attempt and the retry fail, data is never bound;
        # the resulting error is caught and logged by parse_pages
        return data
The first version I wrote, which scrapes the data by year:
# -*- coding: UTF-8 -*-
import requests
from fake_useragent import UserAgent
from lxml import etree
import re, time
from requests.adapters import HTTPAdapter  # for mounting retry adapters
from queue import Queue


class Gspider(object):
    def __init__(self, year):
        self.ua = UserAgent()
        self.headers = {'User-Agent': self.ua.random}
        self.url = "https://www.g-mark.org"
        self.year = str(year)
        self.datas = Queue(300)
        self.page_urls = []

    # GET with retries mounted (note: not wired into the methods below yet)
    def timeout_get(self, url):
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        try:
            response = s.get(url, headers=self.headers, timeout=20)
        except requests.exceptions.RequestException as e:
            print(f'Request failed, error: {e}')
            with open('spider.txt', 'a+', encoding='utf-8') as f:
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                f.write(f'{url}-{e}-{now}\n')
        return response

    # collect the detail-page links for the year
    def parse_list(self):
        url = "%s/award/search?" % self.url
        params = {
            'from': self.year,
            'to': '',
            'prizeCode': '',
            'keyword': '',
        }
        response = requests.get(url, params=params, headers=self.headers, timeout=10).content.decode('utf-8')
        time.sleep(1)
        html = etree.HTML(response)
        hrefs = html.xpath('//section/ul/li/a[1]/@href')
        hrefs_num = len(hrefs)
        for href in hrefs:
            page_url = '%s%s' % (self.url, href)
            self.page_urls.append(page_url)
        print(f'>>> Finished collecting the {self.year} award links!')
        print(f'>>> {hrefs_num} links in total!')

    def parse_pages(self):
        self.parse_list()
        for page_url in self.page_urls:
            print(page_url)
            try:
                data = self.parse_page(page_url)
                self.datas.put(data)
            except Exception as e:
                print(f'Failed to fetch data, error: {e}')
                with open('spider.txt', 'a+', encoding='utf-8') as f:
                    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    f.write(f'{page_url}-{self.year}-fetch failed, error: {e}-{now}\n')
                break  # stop at the first failure
        datas_num = self.datas.qsize()
        print(f'>>> Finished fetching the {self.year} award data!')
        print(f'>>> {datas_num} records in total!')
        return self.datas

    def parse_page(self, page_url):
        response = requests.get(page_url, headers=self.headers, timeout=10).content.decode('utf-8')
        time.sleep(2)
        html = etree.HTML(response)
        title = html.xpath('//dl[@class="basicinfo"]/dd[@class="item"]/text()')[0]
        title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', '_', title)  # sanitize the title
        detail = html.xpath('//dl[@class="detail"]/dd[1]/p/text()')[0]  # the description
        imgs = html.xpath('//figure[@id="mainphoto"]/ul[@class="photo"]/li/a/img/@src')
        data = (title, imgs, self.year, detail)
        print(data)
        return data


if __name__ == '__main__':
    spider = Gspider(2018)
    spider.parse_pages()
Storage module store.py
import pymysql
import os
import time


class Save(object):
    def __init__(self, data):
        self.host = "localhost"
        self.user = "root"
        self.password = "123456"
        self.db = "xiaoshuo"
        self.port = 3306
        self.connect = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            db=self.db,
            port=self.port,
        )
        self.cursor = self.connect.cursor()  # set up the cursor
        self.data = data

    # write the record to MySQL
    def insert(self):
        category, title, imgs, year, detail = self.data
        imgs = ','.join(imgs)  # store the image URLs as one comma-joined string
        print(imgs)
        try:
            sql = "INSERT INTO g_mark(category,title,imgs,cyear,detail) VALUES(%s,%s,%s,%s,%s)"
            val = (category, title, imgs, year, detail)
            self.cursor.execute(sql, val)
            self.connect.commit()
            print(f'>>> Inserted {title} successfully!')
        except Exception as e:
            print(f'>>> Failed to insert {title}!')
            with open('spider.txt', 'a+', encoding='utf-8') as f:
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                f.write(f'{category},{title},{imgs},{year},{detail}-insert failed, error: {e}-{now}\n')

    def cs(self):
        # close the database connection
        self.cursor.close()
        self.connect.close()

    # write the record to a local txt file
    def save_text(self):
        category, title, imgs, year, detail = self.data
        os.makedirs(f'{year}/{category}', exist_ok=True)  # create the output directory
        texts = '%s%s%s' % (title, '\n', detail)
        try:
            with open(f'{year}/{category}/{title}.txt', 'w', encoding='utf-8') as f:
                f.write(texts)
            print(f'>>> Saved {title}.txt successfully!')
        except Exception as e:
            print(f'>>> Failed to save {title}!')
            with open('spider.txt', 'a+', encoding='utf-8') as f:
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                f.write(f'{category},{title},{imgs},{year},{detail}-save failed, error: {e}-{now}\n')

    def sav(self):
        self.insert()
        self.save_text()
        self.cs()
The database table:
The table structure:
A UNIQUE index on the title column deduplicates records:
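The table screenshots are not reproduced here. As a rough sketch, the g_mark table used by store.py and down.py could be created as below; the column names and the UNIQUE index on title come from the code in this post, but the types and lengths are my assumptions, not the actual schema:

import pymysql

connect = pymysql.connect(host="localhost", user="root",
                          password="123456", db="xiaoshuo", port=3306)
cursor = connect.cursor()
# Hypothetical DDL: column names match the INSERT in store.py;
# types and lengths are assumptions.
cursor.execute("""
CREATE TABLE IF NOT EXISTS g_mark (
    id INT AUTO_INCREMENT PRIMARY KEY,
    category VARCHAR(191),
    title VARCHAR(191),
    imgs TEXT,
    cyear VARCHAR(8),
    detail TEXT,
    UNIQUE KEY uniq_title (title)  -- dedupe on title, as described above
)
""")
connect.commit()
cursor.close()
connect.close()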
Download module down.py
import requests
import os
from fake_useragent import UserAgent
import time
import pymysql
from requests.adapters import HTTPAdapter  # for mounting retry adapters


class Gdown(object):
    def __init__(self, data):
        self.data = data
        self.ua = UserAgent()
        self.headers = {'User-Agent': self.ua.random}

    # scheme one: download the images for the record passed in from the spider
    def date_downs(self):
        category, title, imgs, year, detail = self.data
        path = f'{year}/{category}'
        os.makedirs(path, exist_ok=True)
        i = 1
        for img in imgs:
            img_url = img.split('?')[0]
            print(img_url)
            suffix = os.path.splitext(img_url)[1]  # the image file extension
            img_name = '%s%s%s%s' % (title, '_', i, suffix)
            print(img_name)
            self.get_img(img_url, img_name, path)
            i = i + 1

    # download a single image
    def get_img(self, img_url, img_name, path):
        try:
            r = requests.get(img_url, headers=self.headers, timeout=15)
            time.sleep(2)
            with open(f'{path}/{img_name}', 'wb') as f:
                f.write(r.content)
            print(f">>> Downloaded {img_name} successfully!")
        except Exception as e:
            if "Read timed out" in str(e):
                print(f'>>> Downloading {img_name} timed out, retrying...')
                time.sleep(2)
                s = requests.Session()
                s.mount('http://', HTTPAdapter(max_retries=3))   # retry 3 times
                s.mount('https://', HTTPAdapter(max_retries=3))  # retry 3 times
                try:
                    r = s.get(img_url, timeout=20, headers=self.headers)
                    with open(f'{path}/{img_name}', 'wb') as f:
                        f.write(r.content)
                    print(f">>> Downloaded {img_name} successfully!")
                except requests.exceptions.RequestException as e:
                    print(f">>> Failed to download {img_name}, error: {e}!")
                    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    with open(f'{path}/img_spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'{img_url},{img_name},{path}-download failed, error: {e}-{now}\n')
            else:
                print(f">>> Failed to download {img_name}, error: {e}!")
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                with open(f'{path}/img_spider.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{img_url},{img_name},{path}-download failed, error: {e}-{now}\n')

    # scheme two: query the records from the database, then download their images
    def sql_downs(self):
        # connect to the database
        connect = pymysql.connect(
            host="localhost",
            user="root",
            password="123456",
            db="xiaoshuo",
            port=3306,
        )
        cursor = connect.cursor()  # set up the cursor
        # query the stored records
        cursor.execute("select category, title, imgs, cyear from g_mark")
        datas = cursor.fetchall()
        # close the database connection
        cursor.close()
        connect.close()
        print(len(datas))
        for category, title, imgs, cyear in datas:
            path = f'{cyear}/{category}'
            os.makedirs(path, exist_ok=True)
            i = 1
            img_urls = imgs.split(',')  # undo the comma-join done in store.py
            for img in img_urls:
                img_url = img.split('?')[0]
                print(img_url)
                suffix = os.path.splitext(img_url)[1]  # the image file extension
                img_name = '%s%s%s%s' % (title, '_', i, suffix)
                print(img_name)
                self.get_img(img_url, img_name, path)
                i = i + 1
Run script main.py
from gmspider import Gspider  # the spider module
from store import Save        # the storage module
from down import Gdown        # the download module
import time

if __name__ == '__main__':
    for year in range(2010, 2020):
        print(f'>>> Scraping {year} data..')
        spider = Gspider(year)
        datas = spider.parse_list()
        while True:
            if datas.empty():
                break
            data = datas.get()
            print(data)
            try:
                save = Save(data)
                save.sav()
            except Exception as e:
                print(f'Insert/save failed, error: {e}')
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                with open('run_sql_spider.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{data}-insert/save failed, error: {e}-{now}\n')
            try:
                downs = Gdown(data)
                downs.date_downs()
            except Exception as e:
                print(f'Image download failed, error: {e}')
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                with open('run_img_spider.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{data}-image download failed, error: {e}-{now}\n')
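main.py above implements scheme one (download while scraping). A sketch of a scheme-two run script, scrape and store first, then download everything from the database, could look like this; note that sql_downs never reads self.data, so the constructor argument here is just a placeholder:

from gmspider import Gspider
from store import Save
from down import Gdown

if __name__ == '__main__':
    # Phase 1: scrape and store only, no downloads yet.
    for year in range(2010, 2020):
        datas = Gspider(year).parse_list()
        while not datas.empty():
            Save(datas.get()).sav()

    # Phase 2: pull every stored record back out of MySQL and download
    # its images; sql_downs ignores self.data, so None is a placeholder.
    Gdown(None).sql_downs()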
That is the complete record and write-up of the process. Well, all right, that oversells it a little.
If you have better scraping ideas or cleaner ways to handle this, do get in touch and share them!
Thanks!
Personal WeChat: huguo00289
Project files, packaged:
Link:
https://pan.baidu.com/s/1X_Ib4eAngSTTDUBZSeRuJA
Extraction code:
s84i