# 13. 数据集大小 / 特征数 / 问题POI数

``````python
import pickle

# NOTE: `enron_data` is not defined in this snippet; it is assumed to be the
# dataset dictionary (person name -> feature dictionary) loaded beforehand.

# Number of entries (people) in the dataset.
print(len(enron_data))

print("######### 特征数")
# Inspect only the first record: print how many features it has and show it.
for features in enron_data.values():
    print(len(features))
    print(features)
    break

print("######### 有问题的poi数")
# Count the records whose "poi" flag is set (True compares equal to 1).
poi_count = sum(1 for record in enron_data.values() if record["poi"] == 1)
print(poi_count)
``````

``````
146
#########
21
{'salary': 365788, 'to_messages': 807, 'deferral_payments': 'NaN', 'total_payments': 1061827, 'exercised_stock_options': 'NaN', 'bonus': 600000, 'restricted_stock': 585062, 'shared_receipt_with_poi': 702, 'restricted_stock_deferred': 'NaN', 'total_stock_value': 585062, 'expenses': 94299, 'loan_advances': 'NaN', 'from_messages': 29, 'other': 1740, 'from_this_person_to_poi': 1, 'poi': False, 'director_fees': 'NaN', 'deferred_income': 'NaN', 'long_term_incentive': 'NaN', 'email_address': 'mark.metts@enron.com', 'from_poi_to_this_person': 38}
#########
18
``````

# 18. 特定人员的股票数目

``````for key,value in enron_data.items():
if (key == "PRENTICE JAMES"):
print key
print value["total_stock_value"]

# this method is more simple
print enron_data["PRENTICE JAMES"]["total_stock_value"]
``````

# 28. 字典到数组的转换

``````python
import numpy as np

def featureFormat(dictionary, features, remove_NaN=True,
                  remove_all_zeroes=True, remove_any_zeroes=False,
                  sort_keys=False):
    """Convert a dictionary of feature dictionaries into a numpy array.

    Each entry of ``dictionary`` maps a key to a dict of feature values. One
    output row is built per key, holding the requested ``features`` (in the
    given order) converted to ``float``.

    Args:
        dictionary: mapping of key -> {feature name: value}.
        features: sequence of feature names to extract. If the first one is
            'poi' it is treated as the label and is never checked for
            removal for zero or missing values.
        remove_NaN: when True, the string "NaN" is converted to 0.0.
        remove_all_zeroes: when True, omit data points for which all the
            checked features are 0.0 (or NaN).
        remove_any_zeroes: when True, omit data points for which any of the
            checked features is 0.0 (or NaN).
        sort_keys: True sorts the keys alphabetically. A string is taken as
            the path of a pickle file holding a preset key order (used for
            Python 3 compatibility; leave as False for the course
            mini-projects).

    Returns:
        A numpy array with one row per retained data point, or None (after
        printing an error message) when a key/feature lookup fails.
    """
    # Key order - the string branch is for Python 3 compatibility on the
    # mini-projects, the sorted branch is for the final project.
    if isinstance(sort_keys, str):
        import pickle
        # NOTE(security): unpickling can execute arbitrary code; only pass
        # key-order files that come from a trusted source.
        with open(sort_keys, "rb") as key_file:
            keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    # The 'poi' label (when it is the first feature) is excluded from the
    # zero/NaN checks below.
    skip_label = len(features) > 0 and features[0] == 'poi'

    return_list = []
    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                # Same text the original space-separated print produced.
                print("error: key  %s  not present" % (feature,))
                return
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append(float(value))

        test_list = tmp_list[1:] if skip_label else tmp_list

        # Values are floats by now, so a kept "NaN" is float('nan') and must
        # be detected with isnan; comparing against the string never matches.
        is_empty = [item == 0 or np.isnan(item) for item in test_list]

        # Logic for deciding whether or not to add the data point.
        append = True
        if remove_all_zeroes and all(is_empty):
            append = False
        if remove_any_zeroes and any(is_empty):
            append = False

        if append:
            return_list.append(np.array(tmp_list))

    return np.array(return_list)

def targetFeatureSplit(data):
    """Split rows into the target value and the remaining features.

    Given a numpy array like the one returned from featureFormat, the first
    element of each row is taken as the target (the quantity to predict) and
    the rest of the row as that data point's features.

    Args:
        data: iterable of indexable, sliceable rows.

    Returns:
        A ``(target, features)`` tuple of two lists of equal length:
        ``target[i]`` is ``row[0]`` and ``features[i]`` is ``row[1:]`` of the
        i-th row. (sklearn can generally handle both lists and numpy arrays
        as input formats when training/predicting.)
    """
    target = []
    features = []
    # Single pass so that one-shot iterables (e.g. generators) also work.
    for row in data:
        target.append(row[0])
        features.append(row[1:])

    return target, features
``````