Source code for cvtools.label_analysis.size_analysis

# -*- encoding:utf-8 -*-
# @Time    : 2019/9/9 20:46
# @Author  : gfjiang
# @Site    : 
# @File    : size_analysis.py
# @Software: PyCharm
from collections import defaultdict
import math
import numpy as np

import cvtools
from cvtools.cocotools.coco import COCO
from cvtools.utils.misc import sort_dict


class SizeAnalysis(object):
    """Analyze the object-size distribution of a COCO-format dataset.

    for small objects: area < 32^2
    for medium objects: 32^2 <= area < 96^2
    for large objects: area >= 96^2
    see http://cocodataset.org/#detection-eval
    and https://arxiv.org/pdf/1405.0312.pdf
    """

    def __init__(self, coco, size_range=(32, 96)):
        if isinstance(coco, str):
            coco = COCO(coco)
        assert isinstance(coco, COCO)
        self.coco = coco
        self.size_range = size_range
        # self.low_limit = 0
        # self.up_limit = 100000000
        self.createIndex()
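    # With the default size_range=(32, 96), the area cut-offs work out to
    # 32^2 = 1024 and 96^2 = 9216 pixels, so a 30x30 box (area 900) counts
    # as small and a 100x100 box (area 10000) as large. These boxes are
    # illustrative, not taken from a real dataset.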
    def createIndex(self):
        """Split the COCO dataset into one single-category sub-dataset
        per category."""
        self.catToDatasets = []
        catToImgs = sort_dict(self.coco.catToImgs)
        for cat, img_ids in catToImgs.items():
            img_ids = set(img_ids)
            categories = [cat_info for cat_info
                          in self.coco.dataset['categories']
                          if cat_info['id'] == cat]
            images = [img_info for img_info
                      in self.coco.dataset['images']
                      if img_info['id'] in img_ids]
            annotations = [ann_info for ann_info
                           in self.coco.dataset['annotations']
                           if ann_info['category_id'] == cat]
            self.catToDatasets.append(
                {'info': self.coco.dataset['info'],
                 'categories': categories,
                 'images': images,
                 'annotations': annotations})
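    # Illustrative shape of one self.catToDatasets entry (field values are
    # hypothetical): a COCO-style dict restricted to a single category, e.g.
    # {'info': {...},
    #  'categories': [{'id': 1, 'name': 'plane'}],
    #  'images': [<images containing at least one 'plane' box>],
    #  'annotations': [<all 'plane' annotations>]}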
    def stats_size_per_cat(self, to_file='size_per_cat_data.json'):
        """Count small/medium/large objects per category and dump to JSON."""
        self.cat_size = defaultdict(list)
        for cat_id, dataset in enumerate(self.catToDatasets):
            # bbox is [x, y, w, h]; area = w * h
            self.cat_size[dataset['categories'][0]['name']] = [
                ann_info['bbox'][2] * ann_info['bbox'][3]
                for ann_info in dataset['annotations']]
        self.cat_size = dict(
            sorted(self.cat_size.items(), key=lambda item: len(item[1])))
        g2_data = []
        size_split1 = pow(self.size_range[0], 2)    # 32^2 = 1024
        size_split2 = pow(self.size_range[1], 2)    # 96^2 = 9216
        for cat_name, sizes in self.cat_size.items():
            data_dict = dict()
            data_dict['Category'] = cat_name
            data_dict['small'] = len(
                [size for size in sizes if size < size_split1])
            data_dict['medium'] = len(
                [size for size in sizes
                 if size_split1 <= size < size_split2])
            data_dict['large'] = len(
                [size for size in sizes if size >= size_split2])
            g2_data.append(data_dict)
        cvtools.dump_json(g2_data, to_file)
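    # One record of the dumped size_per_cat_data.json would look like this
    # (category name and counts are made up):
    # {"Category": "plane", "small": 120, "medium": 85, "large": 30}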
    def stats_objs_per_img(self, to_file='stats_num.json'):
        """Average number of objects per image, per category and overall."""
        total_anns = 0
        imgToNum = dict()
        for cat_id, img_ids in self.coco.catToImgs.items():
            # catToImgs holds one image id per annotation, so len(img_ids)
            # counts annotations while the set counts distinct images.
            imgs = set(img_ids)
            total_anns += len(img_ids)
            assert len(imgs) > 0
            cat_name = self.coco.cats[cat_id]['name']
            imgToNum[cat_name] = len(img_ids) / float(len(imgs))
        imgToNum['total'] = total_anns / float(len(self.coco.imgs))
        print(imgToNum)
        cvtools.dump_json(imgToNum, to_file)
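    # Illustrative stats_num.json content (numbers are made up):
    # {"plane": 3.2, "ship": 5.7, "total": 4.1}
    # read as: images containing a 'plane' average 3.2 plane boxes each.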
    def stats_objs_per_cat(self, to_file='objs_per_cat_data.json'):
        """Total number of annotations per category, dumped to JSON."""
        cls_to_num = list()
        for cat_id in self.coco.catToImgs:
            item = dict()
            item['name'] = self.coco.cats[cat_id]['name']
            item['value'] = len(self.coco.catToImgs[cat_id])
            cls_to_num.append(item)
        cvtools.dump_json(cls_to_num, to_file=to_file)
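    # Illustrative objs_per_cat_data.json content (counts are made up):
    # [{"name": "plane", "value": 8054}, {"name": "ship", "value": 26039}]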
# TODO: to fix
    def get_weights_for_balanced_classes(self, to_file='weighted_samples_'):
        """Compute per-image sampling weights that balance classes.

        Note: assumes category ids and image ids are contiguous and 1-based.
        `to_file` is used as a filename prefix for the dumped pickles.
        """
        count_per_class = [0.] * len(self.coco.cats)
        for cat_id, imgs in self.coco.catToImgs.items():
            count_per_class[cat_id - 1] = len(imgs)
        sort_ids = np.argsort(np.array(count_per_class))
        # weight2_per_class = [0.] * len(self.coco.cats)
        # for cat_id, imgs in self.coco.catToImgs.items():
        #     weight2_per_class[cat_id-1] = len(set(imgs))
        sum_objs = sum(count_per_class)
        weight_per_class = [0.] * len(self.coco.cats)
        for i in range(len(count_per_class)):
            # the exact numerator does not matter; weights are normalized below
            weight_per_class[i] = sum_objs / count_per_class[i]
        sum_w = sum(weight_per_class)
        for i in range(len(weight_per_class)):
            weight_per_class[i] = weight_per_class[i] / sum_w
        # log_count_per_class = [math.log(c) for c in count_per_class]
        # sum_log_count_per_class = sum(log_count_per_class)
        # log_w = [sum_log_count_per_class / log_c
        #          for log_c in log_count_per_class]
        log_max_w = math.log(max(weight_per_class))
        log_w = [math.log(w) / log_max_w for w in weight_per_class]
        sum_log_w = sum(log_w)
        for i in range(len(log_w)):
            log_w[i] = log_w[i] / sum_log_w

        from matplotlib import pyplot as plt
        num_cats = len(self.coco.cats)
        plt.plot(np.array(count_per_class)[sort_ids],
                 np.array([1. / num_cats] * num_cats),
                 label='P=1/{}'.format(num_cats))
        plt.plot(np.array(count_per_class)[sort_ids],
                 np.array(weight_per_class)[sort_ids],
                 label='N*P=${C_1}$', marker='o', ms=4)
        plt.plot(np.array(count_per_class)[sort_ids],
                 np.array(log_w)[sort_ids][::-1],
                 label='N*log(P)=${C_2}$', marker='+')
        plt.xlabel('Number of Category Instances')
        plt.ylabel('Sampling Probability')
        plt.legend(loc='best', shadow=False, fontsize=12)
        plt.rcParams['xtick.direction'] = 'in'
        plt.rcParams['ytick.direction'] = 'in'
        plt.savefig('E:/DL/cvtools/P-N.png', dpi=100)
        plt.show()

        plt.plot(np.array([1. / num_cats] * num_cats),
                 label='P=1/{}'.format(num_cats))
        plt.plot(np.array(weight_per_class)[sort_ids],
                 label='N*P=${C_1}$', marker='o', ms=4)
        plt.plot(np.array(log_w)[sort_ids][::-1],
                 marker='+', label='N*log(P)=${C_2}$')
        plt.xlabel('Class Index')
        plt.ylabel('Sampling Probability')
        plt.legend(loc='best', shadow=False, fontsize=12)
        plt.rcParams['xtick.direction'] = 'in'
        plt.rcParams['ytick.direction'] = 'in'
        plt.savefig('E:/DL/cvtools/P-C.png', dpi=100)
        plt.show()

        # per-image weight: mean of the class weights of its annotations
        weight = [0] * len(self.coco.imgs)
        for idx, anns in self.coco.imgToAnns.items():
            weight[idx - 1] = sum(
                [weight_per_class[ann['category_id'] - 1]
                 for ann in anns]) / len(anns)
        max_weight = max(weight)
        min_weight = min(weight)
        log_max_w = math.log(max_weight)
        log_weight = [math.log(w) / log_max_w for w in weight]
        linear_weight = [w / min_weight for w in weight]
        cvtools.draw_hist(np.sort(np.array(log_weight)), bins=20)
        cvtools.draw_hist(np.sort(np.array(weight)), bins=20)
        cvtools.draw_hist(np.sort(np.array(linear_weight)), bins=20)

        sum_log_weight = sum(log_weight)
        log_weight_p = list(map(lambda x: x / sum_log_weight, log_weight))
        log_weight_p = np.sort(np.array(log_weight_p))

        def bin_data_for_same_sample_num(x, bins=10):
            """Bin sorted data so every bin holds the same number of samples."""
            bin_len = int(len(x) / bins)
            data_bin = []
            for bin_i in range(bins - 1):
                data_bin.append(
                    np.mean(x[bin_i * bin_len:(bin_i + 1) * bin_len]))
            data_bin.append(np.mean(x[(bins - 1) * bin_len:]))
            return data_bin

        def bin_data_for_same_value_interval(x, bins=10):
            """Bin sorted data into equal-width value intervals; return the
            mean and the sample count of each bin."""
            x = np.sort(x)
            min_x = x[0]
            max_x = x[-1]
            bin_len = (max_x - min_x) / float(bins)
            counts = []
            means = []
            last_inds = 0
            for bin_value in np.arange(min_x + bin_len, max_x, bin_len):
                inds = 0
                for a in x:
                    if a > bin_value:
                        break
                    inds += 1
                means.append(np.mean(x[last_inds:inds]))
                counts.append(inds - last_inds)
                last_inds = inds
            return means, counts

        means, counts = bin_data_for_same_value_interval(log_weight_p,
                                                         bins=100)
        cvtools.draw_simple(x=means, y=counts)

        cvtools.save_pkl(weight, to_file=to_file + 'weights.pkl')
        cvtools.save_pkl(log_weight, to_file=to_file + 'log_weights.pkl')
        cvtools.save_pkl(linear_weight,
                         to_file=to_file + 'linear_weights.pkl')
        return weight
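# A minimal sketch of feeding the returned per-image weights to a PyTorch
# sampler. cvtools does not itself depend on torch, and `dataset` below is a
# placeholder, so treat this as an assumption about the caller's training
# code rather than part of this module's API:
#
#   from torch.utils.data import DataLoader, WeightedRandomSampler
#   weights = SizeAnalysis('instances.json').get_weights_for_balanced_classes()
#   sampler = WeightedRandomSampler(
#       weights, num_samples=len(weights), replacement=True)
#   loader = DataLoader(dataset, batch_size=16, sampler=sampler)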
if __name__ == '__main__':
    size_analysis = SizeAnalysis('coco/instances_train2017.json')
    size_analysis.stats_size_per_cat(to_file='coco/size_per_cat_data.json')
    size_analysis.stats_objs_per_cat(to_file='coco/objs_per_cat_data.json')