# Core Python libraries that need to be imported
import pandas as pd
import numpy as np
#Date stuff
from datetime import datetime
from datetime import timedelta
#Library for Nice graphing
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sn
%matplotlib inline
#Library for statistics operation
import scipy.stats as stats
# Date Time library
from datetime import datetime
#Machine learning Library
import statsmodels.api as sm
from sklearn import metrics
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The old module
# is kept as a fallback so very old installations still work.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Ignore warnings
import warnings
# Settings
# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
# Font sizes for axis labels and tick labels on every matplotlib figure.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})
# Load the raw CSV inputs (paths are relative to the working directory).
# Training data set
train = pd.read_csv("train.csv")
# Test data set -- `test` was merged below without ever being loaded, which
# raised NameError.
# NOTE(review): the file name "test.csv" is assumed by analogy with
# "train.csv" -- confirm against the data directory.
test = pd.read_csv("test.csv")
# Store data set
stores = pd.read_csv("stores.csv")
# Features data set
feature = pd.read_csv("features.csv")

# The join keys are written out instead of relying on pd.merge's default of
# "every column name the two frames share", so an unrelated column that is
# later added to both files cannot silently change the join.
# NOTE(review): the key lists are read off the merged column layout shown in
# the notebook output (no duplicated Date / IsHoliday columns) -- confirm
# against the CSV headers.

# For Train data set
train_bt = pd.merge(train, stores, on=['Store'])
train = pd.merge(train_bt, feature, on=['Store', 'Date', 'IsHoliday'])
# For test data set
test_bt = pd.merge(test, stores, on=['Store'])
test = pd.merge(test_bt, feature, on=['Store', 'Date', 'IsHoliday'])
Store | Dept | Date | Weekly_Sales | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | |
0 | 1 | 1 | 2010-02-05 | 24924.50 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
1 | 1 | 2 | 2010-02-05 | 50605.27 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
Store | Dept | Date | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | |
0 | 1 | 1 | 2012-11-02 | False | A | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 2737.42 | 223.462779 | 6.573 |
1 | 1 | 2 | 2012-11-02 | False | A | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 2737.42 | 223.462779 | 6.573 |
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" after each report.
train.info()
print("*****************************************")
test.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 421570 entries, 0 to 421569 Data columns (total 16 columns): Store 421570 non-null int64 Dept 421570 non-null int64 Date 421570 non-null object Weekly_Sales 421570 non-null float64 IsHoliday 421570 non-null bool Type 421570 non-null object Size 421570 non-null int64 Temperature 421570 non-null float64 Fuel_Price 421570 non-null float64 MarkDown1 150681 non-null float64 MarkDown2 111248 non-null float64 MarkDown3 137091 non-null float64 MarkDown4 134967 non-null float64 MarkDown5 151432 non-null float64 CPI 421570 non-null float64 Unemployment 421570 non-null float64 dtypes: bool(1), float64(10), int64(3), object(2) memory usage: 51.9+ MB None ***************************************** <class 'pandas.core.frame.DataFrame'> Int64Index: 115064 entries, 0 to 115063 Data columns (total 15 columns): Store 115064 non-null int64 Dept 115064 non-null int64 Date 115064 non-null object IsHoliday 115064 non-null bool Type 115064 non-null object Size 115064 non-null int64 Temperature 115064 non-null float64 Fuel_Price 115064 non-null float64 MarkDown1 114915 non-null float64 MarkDown2 86437 non-null float64 MarkDown3 105235 non-null float64 MarkDown4 102176 non-null float64 MarkDown5 115064 non-null float64 CPI 76902 non-null float64 Unemployment 76902 non-null float64 dtypes: bool(1), float64(9), int64(3), object(2) memory usage: 13.3+ MB None
# Take only those rows whose weekly sales are strictly positive.
train = train[train['Weekly_Sales'] > 0]

# Split the column names by dtype. train.dtypes is walked once instead of
# rebuilding dict(train.dtypes) for every single column. Columns whose dtype
# is in neither list (e.g. bool) are left out of both.
numeric_var_train = [col for col, dtype in train.dtypes.items()
                     if dtype in ['float64', 'int64', 'float32', 'int32']]
cat_var_train = [col for col, dtype in train.dtypes.items()
                 if dtype in ['object']]

# train_num / train_cat are consumed by the summary cells further down but
# were never assigned; build them from the name lists above.
# Train Numerical Data
train_num = train[numeric_var_train]
# Train Categorical Data
train_cat = train[cat_var_train]

print(numeric_var_train)
print(cat_var_train)
['Store', 'Dept', 'Weekly_Sales', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment'] ['Date', 'Type']
# Use a general function that returns multiple values
def var_summary(x):
    """Summarise one numeric column as a labelled pandas Series.

    Parameters
    ----------
    x : pandas.Series
        Numeric column; missing values are allowed.

    Returns
    -------
    pandas.Series
        Values labelled N (non-null count), NMISS (null count), SUM, MEAN,
        MEDIAN, STD, VAR, MIN, the percentiles P1, P5, P10, P25, P50, P75,
        P90, P95, P99, and MAX, in that order.
    """
    # Drop the missing values once and ask for every percentile in a single
    # call instead of repeating x.dropna().quantile(q) nine times.
    percentiles = x.dropna().quantile(
        [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99])
    values = [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
              x.std(), x.var(), x.min()]
    values = values + percentiles.tolist() + [x.max()]
    return pd.Series(
        values,
        index=['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
               'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99',
               'MAX'])
# Apply var_summary to every column of train_num, then transpose the result.
num_summary = train_num.apply(var_summary).T
def cat_summary(x):
    """Summarise one categorical column as a labelled pandas Series.

    Parameters
    ----------
    x : pandas.Series
        Column to summarise; missing values are allowed.

    Returns
    -------
    pandas.Series
        Three entries: N (non-null count), NMISS (null count) and
        ColumnsNames, which holds the whole x.value_counts() Series as a
        single element.
    """
    return pd.Series(
        [x.count(), x.isnull().sum(), x.value_counts()],
        index=['N', 'NMISS', 'ColumnsNames'])
# Apply cat_summary to every column of train_cat.
# NOTE: the result is stored under the name `cat_summary`, so after this line
# the name refers to the result and no longer to the function.
cat_summary = train_cat.apply(cat_summary)
Date | Type | |
N | 420212 | 420212 |
NMISS | 0 | 0 |
ColumnsNames | 2011-12-23 3018 2011-11-25 3016 2011-12-... | A 214961 B 162787 C 42464 Name: Type... |
# Split the test column names by dtype. test.dtypes is walked once instead of
# rebuilding dict(test.dtypes) for every single column. Columns whose dtype is
# in neither list (e.g. bool) are left out of both.
numeric_var_test = [col for col, dtype in test.dtypes.items()
                    if dtype in ['float64', 'int64', 'float32', 'int32']]
cat_var_test = [col for col, dtype in test.dtypes.items()
                if dtype in ['object']]

# test_num / test_cat are consumed by the summary cells further down but were
# never assigned; build them from the name lists above.
# Test Numerical Data
test_num = test[numeric_var_test]
# Test Categorical Data
test_cat = test[cat_var_test]

print(numeric_var_test)
print(cat_var_test)
['Store', 'Dept', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment'] ['Date', 'Type']
# Numerical data summary report for the test set.
# Apply var_summary to every column of test_num, then transpose the result.
# This reuses the name `num_summary`, replacing the value assigned from
# train_num earlier in the script.
num_summary = test_num.apply(var_summary).T
N | NMISS | SUM | MEAN | MEDIAN | STD | VAR | MIN | P1 | P5 | P10 | P25 | P50 | P75 | P90 | P95 | P99 | MAX | |
Store | 115064.0 | 0.0 | 2.558817e+06 | 22.238207 | 22.000 | 12.809930 | 1.640943e+02 | 1.000 | 1.000 | 3.000 | 5.000 | 11.000 | 22.000 | 33.000 | 40.000 | 43.000 | 45.000 | 45.000 |
Dept | 115064.0 | 0.0 | 5.101883e+06 | 44.339524 | 37.000 | 30.656410 | 9.398155e+02 | 1.000 | 1.000 | 4.000 | 7.000 | 18.000 | 37.000 | 74.000 | 92.000 | 95.000 | 98.000 | 99.000 |
Size | 115064.0 | 0.0 | 1.570597e+10 | 136497.688921 | 140167.000 | 61106.926438 | 3.734056e+09 | 34875.000 | 34875.000 | 39690.000 | 39910.000 | 93638.000 | 140167.000 | 202505.000 | 204184.000 | 206302.000 | 219622.000 | 219622.000 |
Temperature | 115064.0 | 0.0 | 6.206760e+06 | 53.941804 | 54.470 | 18.724153 | 3.505939e+02 | -7.290 | 11.440 | 23.980 | 29.970 | 39.820 | 54.470 | 67.350 | 79.480 | 83.820 | 92.140 | 101.950 |
Fuel_Price | 115064.0 | 0.0 | 4.121070e+05 | 3.581546 | 3.606 | 0.239442 | 5.733244e-02 | 2.872 | 2.957 | 3.161 | 3.227 | 3.431 | 3.606 | 3.766 | 3.866 | 3.951 | 4.079 | 4.125 |
# Categorical data summary report
def cat_summary(x):
    """Summarise one categorical column as a labelled pandas Series.

    Parameters
    ----------
    x : pandas.Series
        Column to summarise; missing values are allowed.

    Returns
    -------
    pandas.Series
        Three entries: N (non-null count), NMISS (null count) and
        ColumnsNames, which holds the whole x.value_counts() Series as a
        single element.
    """
    return pd.Series(
        [x.count(), x.isnull().sum(), x.value_counts()],
        index=['N', 'NMISS', 'ColumnsNames'])
# Apply cat_summary to every column of test_cat.
# NOTE: the result is stored under the name `cat_summary`, so after this line
# the name refers to the result and no longer to the function.
cat_summary = test_cat.apply(cat_summary)
Date | Type | |
N | 115064 | 115064 |
NMISS | 0 | 0 |
ColumnsNames | 2012-12-21 3002 2012-12-07 2989 2012-12-... | A 58713 B 44500 C 11851 Name: Type, d... |
# Run pandas-profiling to see the overall report
import pandas_profiling
Dataset info
Number of variables | 16 |
Number of observations | 421570 |
Total Missing (%) | 21.1% |
Total size in memory | 51.9 MiB |
Average record size in memory | 129.0 B |
Variables types
Numeric | 13 |
Categorical | 2 |
Boolean | 1 |
Date | 0 |
Text (Unique) | 0 |
Rejected | 0 |
Unsupported | 0 |
has a high cardinality: 143 distinct values WarningMarkDown1
has 270889 / 64.3% missing values MissingMarkDown2
has 310322 / 73.6% missing values MissingMarkDown3
has 284479 / 67.5% missing values MissingMarkDown4
has 286603 / 68.0% missing values MissingMarkDown5
has 270138 / 64.1% missing values MissingCPI
Distinct count | 2145 |
Unique (%) | 0.5% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 171.2 |
Minimum | 126.06 |
Maximum | 227.23 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 126.06 |
5-th percentile | 126.5 |
Q1 | 132.02 |
Median | 182.32 |
Q3 | 212.42 |
95-th percentile | 221.94 |
Maximum | 227.23 |
Range | 101.17 |
Interquartile range | 80.394 |
Descriptive statistics
Standard deviation | 39.159 |
Coef of variation | 0.22873 |
Kurtosis | -1.8297 |
Mean | 171.2 |
MAD | 38.066 |
Skewness | 0.085219 |
Sum | 72174000 |
Variance | 1533.4 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
129.8555333 | 711 | 0.2% |
131.1083333 | 708 | 0.2% |
129.84596670000002 | 707 | 0.2% |
130.38490320000002 | 706 | 0.2% |
130.683 | 706 | 0.2% |
131.0756667 | 706 | 0.2% |
130.6457931 | 706 | 0.2% |
130.7196333 | 705 | 0.2% |
130.4546207 | 705 | 0.2% |
129.98454840000002 | 704 | 0.2% |
Other values (2135) | 414506 | 98.3% |
Minimum 5 values
Value | Count | Frequency (%) | |
126.064 | 678 | 0.2% |
126.0766452 | 679 | 0.2% |
126.08545159999998 | 675 | 0.2% |
126.08929029999999 | 682 | 0.2% |
126.1019355 | 686 | 0.2% |
Maximum 5 values
Value | Count | Frequency (%) | |
227.01841659999997 | 69 | 0.0% |
227.0369359 | 70 | 0.0% |
227.16939190000002 | 63 | 0.0% |
227.21428799999998 | 62 | 0.0% |
227.2328068 | 63 | 0.0% |
Distinct count | 143 |
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
2011-12-23 |
2011-11-25 |
2011-12-16 |
Other values (140) |
Value | Count | Frequency (%) | |
2011-12-23 | 3027 | 0.7% |
2011-11-25 | 3021 | 0.7% |
2011-12-16 | 3013 | 0.7% |
2011-12-09 | 3010 | 0.7% |
2012-02-17 | 3007 | 0.7% |
2011-12-30 | 3003 | 0.7% |
2012-02-10 | 3001 | 0.7% |
2011-12-02 | 2994 | 0.7% |
2012-03-02 | 2990 | 0.7% |
2012-10-12 | 2990 | 0.7% |
Other values (133) | 391514 | 92.9% |
Distinct count | 81 |
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 44.26 |
Minimum | 1 |
Maximum | 99 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 1 |
5-th percentile | 4 |
Q1 | 18 |
Median | 37 |
Q3 | 74 |
95-th percentile | 95 |
Maximum | 99 |
Range | 98 |
Interquartile range | 56 |
Descriptive statistics
Standard deviation | 30.492 |
Coef of variation | 0.68893 |
Kurtosis | -1.2156 |
Mean | 44.26 |
MAD | 26.537 |
Skewness | 0.35822 |
Sum | 18658822 |
Variance | 929.77 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
1 | 6435 | 1.5% |
10 | 6435 | 1.5% |
38 | 6435 | 1.5% |
21 | 6435 | 1.5% |
67 | 6435 | 1.5% |
16 | 6435 | 1.5% |
14 | 6435 | 1.5% |
13 | 6435 | 1.5% |
79 | 6435 | 1.5% |
81 | 6435 | 1.5% |
Other values (71) | 357220 | 84.7% |
Minimum 5 values
Value | Count | Frequency (%) | |
1 | 6435 | 1.5% |
2 | 6435 | 1.5% |
3 | 6435 | 1.5% |
4 | 6435 | 1.5% |
5 | 6347 | 1.5% |
Maximum 5 values
Value | Count | Frequency (%) | |
95 | 6435 | 1.5% |
96 | 4854 | 1.2% |
97 | 6278 | 1.5% |
98 | 5836 | 1.4% |
99 | 862 | 0.2% |
Distinct count | 892 |
Unique (%) | 0.2% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3.361 |
Minimum | 2.472 |
Maximum | 4.468 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 2.472 |
5-th percentile | 2.653 |
Q1 | 2.933 |
Median | 3.452 |
Q3 | 3.738 |
95-th percentile | 4.029 |
Maximum | 4.468 |
Range | 1.996 |
Interquartile range | 0.805 |
Descriptive statistics
Standard deviation | 0.45851 |
Coef of variation | 0.13642 |
Kurtosis | -1.1854 |
Mean | 3.361 |
MAD | 0.4032 |
Skewness | -0.1049 |
Sum | 1416900 |
Variance | 0.21024 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
3.638 | 2548 | 0.6% |
3.63 | 2164 | 0.5% |
2.7710000000000004 | 1917 | 0.5% |
3.891 | 1856 | 0.4% |
3.594 | 1796 | 0.4% |
3.5239999999999996 | 1793 | 0.4% |
3.523 | 1792 | 0.4% |
2.72 | 1790 | 0.4% |
3.6660000000000004 | 1778 | 0.4% |
2.78 | 1656 | 0.4% |
Other values (882) | 402480 | 95.5% |
Minimum 5 values
Value | Count | Frequency (%) | |
2.472 | 38 | 0.0% |
2.513 | 45 | 0.0% |
2.5140000000000002 | 906 | 0.2% |
2.52 | 39 | 0.0% |
2.533 | 42 | 0.0% |
Maximum 5 values
Value | Count | Frequency (%) | |
4.294 | 363 | 0.1% |
4.301 | 360 | 0.1% |
4.308 | 168 | 0.0% |
4.449 | 358 | 0.1% |
4.468 | 368 | 0.1% |
Distinct count | 2 |
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Mean | 0.070358 |
True |
(Missing) |
Value | Count | Frequency (%) | |
True | 29661 | 7.0% |
(Missing) | 391909 | 93.0% |
Distinct count | 2278 |
Unique (%) | 0.5% |
Missing (%) | 64.3% |
Missing (n) | 270889 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 7246.4 |
Minimum | 0.27 |
Maximum | 88647 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 0.27 |
5-th percentile | 149.19 |
Q1 | 2240.3 |
Median | 5347.4 |
Q3 | 9210.9 |
95-th percentile | 21801 |
Maximum | 88647 |
Range | 88646 |
Interquartile range | 6970.6 |
Descriptive statistics
Standard deviation | 8291.2 |
Coef of variation | 1.1442 |
Kurtosis | 17.606 |
Mean | 7246.4 |
MAD | 5262.8 |
Skewness | 3.3418 |
Sum | 1091900000 |
Variance | 68744000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
1.5 | 102 | 0.0% |
460.73 | 102 | 0.0% |
175.64 | 93 | 0.0% |
1282.42 | 75 | 0.0% |
9264.48 | 75 | 0.0% |
686.24 | 75 | 0.0% |
5924.71 | 75 | 0.0% |
1483.17 | 75 | 0.0% |
3242.59 | 74 | 0.0% |
10671.71 | 74 | 0.0% |
Other values (2267) | 149861 | 35.5% |
(Missing) | 270889 | 64.3% |
Minimum 5 values
Value | Count | Frequency (%) | |
0.27 | 51 | 0.0% |
0.5 | 49 | 0.0% |
1.5 | 102 | 0.0% |
1.94 | 50 | 0.0% |
2.12 | 52 | 0.0% |
Maximum 5 values
Value | Count | Frequency (%) | |
62567.6 | 66 | 0.0% |
65021.23 | 73 | 0.0% |
75149.79 | 73 | 0.0% |
78124.5 | 70 | 0.0% |
88646.76 | 68 | 0.0% |
Distinct count | 1500 |
Unique (%) | 0.4% |
Missing (%) | 73.6% |
Missing (n) | 310322 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3334.6 |
Minimum | -265.76 |
Maximum | 104520 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -265.76 |
5-th percentile | 1.95 |
Q1 | 41.6 |
Median | 192 |
Q3 | 1926.9 |
95-th percentile | 16497 |
Maximum | 104520 |
Range | 104790 |
Interquartile range | 1885.3 |
Descriptive statistics
Standard deviation | 9475.4 |
Coef of variation | 2.8415 |
Kurtosis | 37.59 |
Mean | 3334.6 |
MAD | 4690.4 |
Skewness | 5.4413 |
Sum | 370970000 |
Variance | 89782000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
1.91 | 539 | 0.1% |
3.0 | 493 | 0.1% |
0.5 | 485 | 0.1% |
1.5 | 471 | 0.1% |
4.0 | 367 | 0.1% |
6.0 | 365 | 0.1% |
7.64 | 354 | 0.1% |
3.82 | 353 | 0.1% |
5.73 | 345 | 0.1% |
19.0 | 345 | 0.1% |
Other values (1489) | 107131 | 25.4% |
(Missing) | 310322 | 73.6% |
Minimum 5 values
Value | Count | Frequency (%) | |
-265.76 | 71 | 0.0% |
-192.0 | 72 | 0.0% |
-20.0 | 72 | 0.0% |
-10.98 | 60 | 0.0% |
-10.5 | 143 | 0.0% |
Maximum 5 values
Value | Count | Frequency (%) | |
82881.16 | 73 | 0.0% |
89121.94 | 74 | 0.0% |
92523.94 | 73 | 0.0% |
97740.99 | 73 | 0.0% |
104519.54 | 72 | 0.0% |
Distinct count | 1663 |
Unique (%) | 0.4% |
Missing (%) | 67.5% |
Missing (n) | 284479 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 1439.4 |
Minimum | -29.1 |
Maximum | 141630 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -29.1 |
5-th percentile | 0.65 |
Q1 | 5.08 |
Median | 24.6 |
Q3 | 103.99 |
95-th percentile | 1059.9 |
Maximum | 141630 |
Range | 141660 |
Interquartile range | 98.91 |
Descriptive statistics
Standard deviation | 9623.1 |
Coef of variation | 6.6854 |
Kurtosis | 77.688 |
Mean | 1439.4 |
MAD | 2578.1 |
Skewness | 8.3995 |
Sum | 197330000 |
Variance | 92604000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
3.0 | 754 | 0.2% |
6.0 | 710 | 0.2% |
2.0 | 660 | 0.2% |
1.0 | 611 | 0.1% |
0.22 | 487 | 0.1% |
0.5 | 463 | 0.1% |
0.01 | 444 | 0.1% |
4.0 | 439 | 0.1% |
3.2 | 379 | 0.1% |
1.98 | 363 | 0.1% |
Other values (1652) | 131781 | 31.3% |
(Missing) | 284479 | 67.5% |
Minimum 5 values
Value | Count | Frequency (%) | |
-29.1 | 72 | 0.0% |
-1.0 | 70 | 0.0% |
-0.87 | 46 | 0.0% |
-0.2 | 69 | 0.0% |
0.0 | 67 | 0.0% |
Maximum 5 values
Value | Count | Frequency (%) | |
89402.64 | 71 | 0.0% |
101378.79 | 73 | 0.0% |
103991.94 | 72 | 0.0% |
109030.75 | 75 | 0.0% |
141630.61 | 74 | 0.0% |
Distinct count | 1945 |
Unique (%) | 0.5% |
Missing (%) | 68.0% |
Missing (n) | 286603 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3383.2 |
Minimum | 0.22 |
Maximum | 67475 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 0.22 |
5-th percentile | 28.76 |
Q1 | 504.22 |
Median | 1481.3 |
Q3 | 3595 |
95-th percentile | 12646 |
Maximum | 67475 |
Range | 67475 |
Interquartile range | 3090.8 |
Descriptive statistics
Standard deviation | 6292.4 |
Coef of variation | 1.8599 |
Kurtosis | 29.997 |
Mean | 3383.2 |
MAD | 3329.7 |
Skewness | 4.8475 |
Sum | 456620000 |
Variance | 39594000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
9.0 | 280 | 0.1% |
4.0 | 200 | 0.0% |
2.0 | 197 | 0.0% |
3.0 | 146 | 0.0% |
47.0 | 143 | 0.0% |
67.72 | 142 | 0.0% |
17.0 | 141 | 0.0% |
657.56 | 141 | 0.0% |
8.0 | 140 | 0.0% |
1330.36 | 140 | 0.0% |
Other values (1934) | 133297 | 31.6% |
(Missing) | 286603 | 68.0% |
Minimum 5 values
Value | Count | Frequency (%) | |
0.22 | 57 | 0.0% |
0.41 | 52 | 0.0% |
0.46 | 48 | 0.0% |
0.78 | 52 | 0.0% |
0.87 | 49 | 0.0% |
Maximum 5 values
Value | Count | Frequency (%) | |
52739.02 | 72 | 0.0% |
53603.99 | 72 | 0.0% |
57815.43 | 68 | 0.0% |
57817.56 | 74 | 0.0% |
67474.85 | 72 | 0.0% |
Distinct count | 2294 |
Unique (%) | 0.5% |
Missing (%) | 64.1% |
Missing (n) | 270138 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 4629 |
Minimum | 135.16 |
Maximum | 108520 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 135.16 |
5-th percentile | 715.52 |
Q1 | 1878.4 |
Median | 3359.4 |
Q3 | 5563.8 |
95-th percentile | 11269 |
Maximum | 108520 |
Range | 108380 |
Interquartile range | 3685.4 |
Descriptive statistics
Standard deviation | 5962.9 |
Coef of variation | 1.2882 |
Kurtosis | 107.85 |
Mean | 4629 |
MAD | 2989.8 |
Skewness | 8.1699 |
Sum | 700970000 |
Variance | 35556000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
2743.18 | 136 | 0.0% |
1064.56 | 120 | 0.0% |
9083.54 | 75 | 0.0% |
20371.02 | 75 | 0.0% |
3567.03 | 75 | 0.0% |
4180.29 | 75 | 0.0% |
3557.67 | 75 | 0.0% |
986.23 | 74 | 0.0% |
1773.53 | 74 | 0.0% |
14660.97 | 74 | 0.0% |
Other values (2283) | 150579 | 35.7% |
(Missing) | 270138 | 64.1% |
Minimum 5 values
Value | Count | Frequency (%) | |
135.16 | 65 | 0.0% |
153.04 | 47 | 0.0% |
153.9 | 49 | 0.0% |
164.08 | 52 | 0.0% |
170.64 | 69 | 0.0% |
Maximum 5 values
Value | Count | Frequency (%) | |
58068.14 | 69 | 0.0% |
63005.58 | 69 | 0.0% |
85851.87 | 68 | 0.0% |
105223.11 | 70 | 0.0% |
108519.28 | 68 | 0.0% |
Distinct count | 40 |
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 136730 |
Minimum | 34875 |
Maximum | 219622 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 34875 |
5-th percentile | 39690 |
Q1 | 93638 |
Median | 140170 |
Q3 | 202500 |
95-th percentile | 206300 |
Maximum | 219622 |
Range | 184747 |
Interquartile range | 108870 |
Descriptive statistics
Standard deviation | 60981 |
Coef of variation | 0.446 |
Kurtosis | -1.2063 |
Mean | 136730 |
MAD | 52517 |
Skewness | -0.32585 |
Sum | 57640387438 |
Variance | 3718600000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
39690 | 20802 | 4.9% |
39910 | 20597 | 4.9% |
203819 | 20376 | 4.8% |
219622 | 10474 | 2.5% |
126512 | 10315 | 2.4% |
205863 | 10272 | 2.4% |
151315 | 10244 | 2.4% |
202307 | 10238 | 2.4% |
204184 | 10225 | 2.4% |
158114 | 10224 | 2.4% |
Other values (30) | 287803 | 68.3% |
Minimum 5 values
Value | Count | Frequency (%) | |
34875 | 8999 | 2.1% |
37392 | 9036 | 2.1% |
39690 | 20802 | 4.9% |
39910 | 20597 | 4.9% |
41062 | 6751 | 1.6% |
Maximum 5 values
Value | Count | Frequency (%) | |
204184 | 10225 | 2.4% |
205863 | 10272 | 2.4% |
206302 | 10113 | 2.4% |
207499 | 10062 | 2.4% |
219622 | 10474 | 2.5% |
Distinct count | 45 |
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 22.201 |
Minimum | 1 |
Maximum | 45 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 1 |
5-th percentile | 3 |
Q1 | 11 |
Median | 22 |
Q3 | 33 |
95-th percentile | 43 |
Maximum | 45 |
Range | 44 |
Interquartile range | 22 |
Descriptive statistics
Standard deviation | 12.785 |
Coef of variation | 0.5759 |
Kurtosis | -1.1465 |
Mean | 22.201 |
MAD | 10.996 |
Skewness | 0.077763 |
Sum | 9359084 |
Variance | 163.46 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
13 | 10474 | 2.5% |
10 | 10315 | 2.4% |
4 | 10272 | 2.4% |
1 | 10244 | 2.4% |
2 | 10238 | 2.4% |
24 | 10228 | 2.4% |
27 | 10225 | 2.4% |
34 | 10224 | 2.4% |
20 | 10214 | 2.4% |
6 | 10211 | 2.4% |
Other values (35) | 318925 | 75.7% |
Minimum 5 values
Value | Count | Frequency (%) | |
1 | 10244 | 2.4% |
2 | 10238 | 2.4% |
3 | 9036 | 2.1% |
4 | 10272 | 2.4% |
5 | 8999 | 2.1% |
Maximum 5 values
Value | Count | Frequency (%) | |
41 | 10088 | 2.4% |
42 | 6953 | 1.6% |
43 | 6751 | 1.6% |
44 | 7169 | 1.7% |
45 | 9637 | 2.3% |
Distinct count | 3528 |
Unique (%) | 0.8% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 60.09 |
Minimum | -2.06 |
Maximum | 100.14 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -2.06 |
5-th percentile | 27.31 |
Q1 | 46.68 |
Median | 62.09 |
Q3 | 74.28 |
95-th percentile | 87.27 |
Maximum | 100.14 |
Range | 102.2 |
Interquartile range | 27.6 |
Descriptive statistics
Standard deviation | 18.448 |
Coef of variation | 0.307 |
Kurtosis | -0.63592 |
Mean | 60.09 |
MAD | 15.377 |
Skewness | -0.3214 |
Sum | 25332000 |
Variance | 340.33 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
50.43 | 709 | 0.2% |
67.87 | 646 | 0.2% |
72.62 | 594 | 0.1% |
76.67 | 583 | 0.1% |
70.28 | 563 | 0.1% |
76.03 | 555 | 0.1% |
50.56 | 544 | 0.1% |
64.05 | 542 | 0.1% |
64.21 | 519 | 0.1% |
50.81 | 487 | 0.1% |
Other values (3518) | 415828 | 98.6% |
Minimum 5 values
Value | Count | Frequency (%) | |
-2.06 | 69 | 0.0% |
5.54 | 68 | 0.0% |
6.23 | 69 | 0.0% |
7.46 | 69 | 0.0% |
9.51 | 70 | 0.0% |
Maximum 5 values
Value | Count | Frequency (%) | |
99.2 | 46 | 0.0% |
99.22 | 185 | 0.0% |
99.66 | 48 | 0.0% |
100.07 | 46 | 0.0% |
100.14 | 44 | 0.0% |
Distinct count | 3 |
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
A |
B |
C |
Value | Count | Frequency (%) | |
A | 215478 | 51.1% |
B | 163495 | 38.8% |
C | 42597 | 10.1% |
Distinct count | 349 |
Unique (%) | 0.1% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 7.9603 |
Minimum | 3.879 |
Maximum | 14.313 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 3.879 |
5-th percentile | 5.326 |
Q1 | 6.891 |
Median | 7.866 |
Q3 | 8.572 |
95-th percentile | 12.187 |
Maximum | 14.313 |
Range | 10.434 |
Interquartile range | 1.681 |
Descriptive statistics
Standard deviation | 1.8633 |
Coef of variation | 0.23407 |
Kurtosis | 2.7312 |
Mean | 7.9603 |
MAD | 1.283 |
Skewness | 1.1837 |
Sum | 3355800 |
Variance | 3.4719 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
8.099 | 5152 | 1.2% |
8.163 | 3636 | 0.9% |
7.852 | 3614 | 0.9% |
7.343 | 3416 | 0.8% |
7.057 | 3414 | 0.8% |
7.931 | 3400 | 0.8% |
7.441 | 3397 | 0.8% |
6.565 | 3370 | 0.8% |
8.2 | 3361 | 0.8% |
6.891 | 3360 | 0.8% |
Other values (339) | 385450 | 91.4% |
Minimum 5 values
Value | Count | Frequency (%) | |
3.8789999999999996 | 287 | 0.1% |
4.077 | 938 | 0.2% |
4.125 | 1831 | 0.4% |
4.145 | 562 | 0.1% |
4.156000000000001 | 1815 | 0.4% |
Maximum 5 values
Value | Count | Frequency (%) | |
13.975 | 1529 | 0.4% |
14.020999999999999 | 2263 | 0.5% |
14.099 | 2441 | 0.6% |
14.18 | 2423 | 0.6% |
14.312999999999999 | 2636 | 0.6% |
Distinct count | 359464 |
Unique (%) | 85.3% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 15981 |
Minimum | -4988.9 |
Maximum | 693100 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -4988.9 |
5-th percentile | 59.975 |
Q1 | 2079.7 |
Median | 7612 |
Q3 | 20206 |
95-th percentile | 61202 |
Maximum | 693100 |
Range | 698090 |
Interquartile range | 18126 |
Descriptive statistics
Standard deviation | 22711 |
Coef of variation | 1.4211 |
Kurtosis | 21.491 |
Mean | 15981 |
MAD | 15161 |
Skewness | 3.262 |
Sum | 6737200000 |
Variance | 515800000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
10.0 | 353 | 0.1% |
5.0 | 289 | 0.1% |
20.0 | 232 | 0.1% |
15.0 | 215 | 0.1% |
12.0 | 175 | 0.0% |
1.0 | 169 | 0.0% |
10.47 | 167 | 0.0% |
11.97 | 154 | 0.0% |
2.0 | 148 | 0.0% |
7.0 | 146 | 0.0% |
Other values (359454) | 419522 | 99.5% |
Minimum 5 values
Value | Count | Frequency (%) | |
-4988.94 | 1 | 0.0% |
-3924.0 | 1 | 0.0% |
-1750.0 | 1 | 0.0% |
-1699.0 | 1 | 0.0% |
-1321.48 | 1 | 0.0% |
Maximum 5 values
Value | Count | Frequency (%) | |
474330.1 | 1 | 0.0% |
627962.93 | 1 | 0.0% |
630999.19 | 1 | 0.0% |
649770.18 | 1 | 0.0% |
693099.36 | 1 | 0.0% |
Store | Dept | Date | Weekly_Sales | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | |
0 | 1 | 1 | 2010-02-05 | 24924.50 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
1 | 1 | 2 | 2010-02-05 | 50605.27 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
2 | 1 | 3 | 2010-02-05 | 13740.12 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
3 | 1 | 4 | 2010-02-05 | 39954.04 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
4 | 1 | 5 | 2010-02-05 | 32229.38 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
Dataset info
Number of variables | 15 |
Number of observations | 115064 |
Total Missing (%) | 7.4% |
Total size in memory | 13.3 MiB |
Average record size in memory | 121.0 B |
Variables types
Numeric | 12 |
Categorical | 2 |
Boolean | 1 |
Date | 0 |
Text (Unique) | 0 |
Rejected | 0 |
Unsupported | 0 |
Distinct count | 361 |
Unique (%) | 0.3% |
Missing (%) | 33.2% |
Missing (n) | 38162 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 176.96 |
Minimum | 131.24 |
Maximum | 228.98 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 131.24 |
5-th percentile | 131.48 |
Q1 | 138.4 |
Median | 192.3 |
Q3 | 223.24 |
95-th percentile | 227.78 |
Maximum | 228.98 |
Range | 97.74 |
Interquartile range | 84.842 |
Descriptive statistics
Standard deviation | 41.24 |
Coef of variation | 0.23305 |
Kurtosis | -1.8588 |
Mean | 176.96 |
MAD | 40.222 |
Skewness | 0.071448 |
Sum | 13609000 |
Variance | 1700.7 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
132.71609679999997 | 2080 | 1.8% |
139.1226129 | 1664 | 1.4% |
201.0705712 | 825 | 0.7% |
224.80253140000002 | 783 | 0.7% |
131.537 | 704 | 0.6% |
132.2725714 | 703 | 0.6% |
131.2793548 | 702 | 0.6% |
131.642 | 702 | 0.6% |
131.4784 | 701 | 0.6% |
132.65377420000002 | 698 | 0.6% |
Other values (350) | 67340 | 58.5% |
(Missing) | 38162 | 33.2% |
Minimum 5 values
Value | Count | Frequency (%) | |
131.2362258 | 695 | 0.6% |
131.2793548 | 702 | 0.6% |
131.3258 | 696 | 0.6% |
131.37666670000002 | 695 | 0.6% |
131.4275333 | 693 | 0.6% |
Maximum 5 values
Value | Count | Frequency (%) | |
228.72986380000003 | 401 | 0.3% |
228.7796682 | 208 | 0.2% |
228.8020401 | 60 | 0.1% |
228.8892482 | 60 | 0.1% |
228.9764563 | 186 | 0.2% |
Distinct count | 39 |
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
2012-12-21 |
2012-12-07 |
2012-12-28 |
Other values (36) |
Value | Count | Frequency (%) | |
2012-12-21 | 3002 | 2.6% |
2012-12-07 | 2989 | 2.6% |
2012-12-28 | 2988 | 2.6% |
2012-12-14 | 2986 | 2.6% |
2013-02-15 | 2984 | 2.6% |
2012-11-23 | 2976 | 2.6% |
2012-11-09 | 2971 | 2.6% |
2013-01-04 | 2964 | 2.6% |
2013-02-08 | 2964 | 2.6% |
2012-11-30 | 2962 | 2.6% |
Other values (29) | 85278 | 74.1% |
Distinct count | 81 |
Unique (%) | 0.1% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 44.34 |
Minimum | 1 |
Maximum | 99 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 1 |
5-th percentile | 4 |
Q1 | 18 |
Median | 37 |
Q3 | 74 |
95-th percentile | 95 |
Maximum | 99 |
Range | 98 |
Interquartile range | 56 |
Descriptive statistics
Standard deviation | 30.656 |
Coef of variation | 0.6914 |
Kurtosis | -1.2242 |
Mean | 44.34 |
MAD | 26.74 |
Skewness | 0.36242 |
Sum | 5101883 |
Variance | 939.82 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
1 | 1755 | 1.5% |
13 | 1755 | 1.5% |
91 | 1755 | 1.5% |
90 | 1755 | 1.5% |
21 | 1755 | 1.5% |
38 | 1755 | 1.5% |
82 | 1755 | 1.5% |
40 | 1755 | 1.5% |
81 | 1755 | 1.5% |
16 | 1755 | 1.5% |
Other values (71) | 97514 | 84.7% |
Minimum 5 values
Value | Count | Frequency (%) | |
1 | 1755 | 1.5% |
2 | 1755 | 1.5% |
3 | 1755 | 1.5% |
4 | 1755 | 1.5% |
5 | 1738 | 1.5% |
Maximum 5 values
Value | Count | Frequency (%) | |
95 | 1755 | 1.5% |
96 | 1350 | 1.2% |
97 | 1716 | 1.5% |
98 | 1632 | 1.4% |
99 | 613 | 0.5% |
Distinct count | 297 |
Unique (%) | 0.3% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3.5815 |
Minimum | 2.872 |
Maximum | 4.125 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 2.872 |
5-th percentile | 3.161 |
Q1 | 3.431 |
Median | 3.606 |
Q3 | 3.766 |
95-th percentile | 3.951 |
Maximum | 4.125 |
Range | 1.253 |
Interquartile range | 0.335 |
Descriptive statistics
Standard deviation | 0.23944 |
Coef of variation | 0.066854 |
Kurtosis | -0.1176 |
Mean | 3.5815 |
MAD | 0.18861 |
Skewness | -0.39128 |
Sum | 412110 |
Variance | 0.057332 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
3.417 | 1853 | 1.6% |
3.583 | 1851 | 1.6% |
3.386 | 1793 | 1.6% |
3.611 | 1374 | 1.2% |
3.108 | 1201 | 1.0% |
3.4789999999999996 | 1169 | 1.0% |
3.597 | 1071 | 0.9% |
3.451 | 1043 | 0.9% |
3.227 | 1040 | 0.9% |
3.614 | 1028 | 0.9% |
Other values (287) | 101641 | 88.3% |
Minimum 5 values
Value | Count | Frequency (%) | |
2.872 | 276 | 0.2% |
2.889 | 276 | 0.2% |
2.9139999999999997 | 193 | 0.2% |
2.927 | 194 | 0.2% |
2.957 | 279 | 0.2% |
Maximum 5 values
Value | Count | Frequency (%) | |
4.079 | 282 | 0.2% |
4.099 | 355 | 0.3% |
4.104 | 186 | 0.2% |
4.109 | 189 | 0.2% |
4.125 | 166 | 0.1% |
Distinct count | 2 |
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Mean | 0.077592 |
True |
(Missing) |
Value | Count | Frequency (%) | |
True | 8928 | 7.8% |
(Missing) | 106136 | 92.2% |
Distinct count | 1753 |
Unique (%) | 1.5% |
Missing (%) | 0.1% |
Missing (n) | 149 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 7689.2 |
Minimum | -2781.4 |
Maximum | 103180 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -2781.4 |
5-th percentile | 189.49 |
Q1 | 1966.5 |
Median | 4842.3 |
Q3 | 9439.1 |
95-th percentile | 23141 |
Maximum | 103180 |
Range | 105970 |
Interquartile range | 7472.7 |
Descriptive statistics
Standard deviation | 10699 |
Coef of variation | 1.3914 |
Kurtosis | 22.871 |
Mean | 7689.2 |
MAD | 6160.2 |
Skewness | 4.1727 |
Sum | 883610000 |
Variance | 114460000 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
4655.55 | 74 | 0.1% |
13357.31 | 74 | 0.1% |
22673.11 | 74 | 0.1% |
13613.52 | 74 | 0.1% |
5692.66 | 74 | 0.1% |
10755.57 | 74 | 0.1% |
9753.88 | 74 | 0.1% |
20297.6 | 74 | 0.1% |
5813.45 | 73 | 0.1% |
7701.72 | 73 | 0.1% |
Other values (1742) | 114177 | 99.2% |
(Missing) | 149 | 0.1% |
Minimum 5 values
Value | Count | Frequency (%) | |
-2781.45 | 50 | 0.0% |
-772.21 | 43 | 0.0% |
-563.9 | 70 |