# Pandas - Operations¶

There are lots of operations with pandas that will be really useful to you, but don't fall into any distinct category. Let's show them here.

In [2]:
#Import library and create a Dataframe
import pandas as pd
df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})

Out[2]:
col1 col2 col3
0 1 444 abc
1 2 555 def
2 3 666 ghi
3 4 444 xyz

### Info on Unique Values¶

Find out the unique values in data frame

In [3]:
df['col2'].unique()

Out[3]:
array([444, 555, 666], dtype=int64)
In [4]:
df['col2'].nunique()

Out[4]:
3
In [5]:
#Find how many times a number has beerepeated in a particular column
df['col2'].value_counts()

Out[5]:
444    2
555    1
666    1
Name: col2, dtype: int64

### Selecting Data¶

Select data on the basis of certain conditions

In [6]:
#Select from DataFrame using criteria from multiple columns
newdf = df[(df['col1']>2) & (df['col2']==444)]

In [7]:
newdf

Out[7]:
col1 col2 col3
3 4 444 xyz

### Applying Functions¶

In [8]:
#Create a square function
def times2(x):
return x*2

In [9]:
#Apply this function on column 1
df['col1'].apply(times2)

Out[9]:
0    2
1    4
2    6
3    8
Name: col1, dtype: int64
In [10]:
df['col3'].apply(len)

Out[10]:
0    3
1    3
2    3
3    3
Name: col3, dtype: int64
In [11]:
#Sum of the element of column 1
df['col1'].sum()

Out[11]:
10

### Permanently Removing a Column¶

In [12]:
del df['col1']

In [13]:
df

Out[13]:
col2 col3
0 444 abc
1 555 def
2 666 ghi
3 444 xyz

### Get column and index names:¶

In [14]:
#Find all column in the data frame
df.columns

Out[14]:
Index(['col2', 'col3'], dtype='object')
In [15]:
find all indexes in the data frame
df.index

Out[15]:
RangeIndex(start=0, stop=4, step=1)

### Sorting and Ordering a DataFrame¶

In [16]:
df

Out[16]:
col2 col3
0 444 abc
1 555 def
2 666 ghi
3 444 xyz
In [17]:
#sorting column values
df.sort_values(by='col2') #inplace=False by default

Out[17]:
col2 col3
0 444 abc
3 444 xyz
1 555 def
2 666 ghi

### Find Null Values or Check for Null Values¶

In [18]:
#It will give you which column has null value which doesn't
df.isnull()

Out[18]:
col2 col3
0 False False
1 False False
2 False False
3 False False
In [20]:
# Drop rows with NaN Values
df.dropna()

Out[20]:
col2 col3
0 444 abc
1 555 def
2 666 ghi
3 444 xyz

### Filling in NaN values with something else:¶

In [21]:
import numpy as np

In [22]:
df = pd.DataFrame({'col1':[1,2,3,np.nan],
'col2':[np.nan,555,666,444],
'col3':['abc','def','ghi','xyz']})

Out[22]:
col1 col2 col3
0 1.0 NaN abc
1 2.0 555.0 def
2 3.0 666.0 ghi
3 NaN 444.0 xyz
In [23]:
#Fill NaN values with 'FILL'
df.fillna('FILL')

Out[23]:
col1 col2 col3
0 1 FILL abc
1 2 555 def
2 3 666 ghi
3 FILL 444 xyz

### Create Pivot table for data:¶

In [24]:
data = {'A':['foo','foo','foo','bar','bar','bar'],
'B':['one','one','two','two','one','one'],
'C':['x','y','x','y','x','y'],
'D':[1,3,2,5,4,1]}

df = pd.DataFrame(data)

In [25]:
df

Out[25]:
A B C D
0 foo one x 1
1 foo one y 3
2 foo two x 2
3 bar two y 5
4 bar one x 4
5 bar one y 1
In [26]:
df.pivot_table(values='D',index=['A', 'B'],columns=['C'])

Out[26]:
C x y
A B
bar one 4.0 1.0
two NaN 5.0
foo one 1.0 3.0
two 2.0 NaN

### How to change column name in the existing data frame¶

In [28]:
df = pd.DataFrame({
'name': ['alice','bob','charlie'],
'age': [25,26,27]
})

df

Out[28]:
name age
0 alice 25
1 bob 26
2 charlie 27
In [29]:
#Change the column name
df.rename(columns={'name':'person_name','age':'age_in_years'})
df

Out[29]:
name age
0 alice 25
1 bob 26
2 charlie 27

### Applying function to column using .map()¶

In [30]:
df = pd.DataFrame({
'name': ['alice','bob','charlie'],
'age': [25,26,27]
})
df

Out[30]:
name age
0 alice 25
1 bob 26
2 charlie 27
In [32]:
#now changing all names to uppercase
df['name'] = df['name'].map(lambda name: name.upper())
df

Out[32]:
name age
0 ALICE 25
1 BOB 26
2 CHARLIE 27

### Apply() function¶

In [51]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
'name': ['alice','bob','charlie','david'],
'age': [25,26,27,22]

})
df

Out[51]:
name age
0 alice 25
1 bob 26
2 charlie 27
3 david 22
In [52]:
#Now use apply() function
df['age_times_2'] = df[['age']].apply(lambda arr: np.multiply(arr,2))
df

Out[52]:
name age age_times_2
0 alice 25 50
1 bob 26 52
2 charlie 27 54
3 david 22 44

So some basic difference between map() and apply() is:

• .map()Can only be applied to a single column (one element ata time) where as .apply can be applied to multiple columns at the same time

• .map() is very slow but .apply() is much faster when you use when you can use numpy vectorized functions.

### Get column names in Dataframe¶

In [34]:
df = pd.DataFrame({
'name': ['alice','bob','charlie'],
'age': [25,26,27],
'state': ['ak','ny','dc']
})
df

Out[34]:
name age state
0 alice 25 ak
1 bob 26 ny
2 charlie 27 dc
In [35]:
#Get column values
print(df.columns.values)

['name' 'age' 'state']

In [36]:
#et number of column in the data frame
print(len(df.columns.values))

3


### Change order of columns in Dataframe¶

To reorder columns, just reassign the dataframe with the columns in the order you want:

In [38]:
df = pd.DataFrame({
'age': [25,26,27],
'name': ['alice','bob','charlie'],
'state': ['ak','ny','dc']
})
df

Out[38]:
age name state
0 25 alice ak
1 26 bob ny
2 27 charlie dc
In [39]:
#Now we want name should be the first column, we can do this in following manner

df = df[['name','age','state']]
df

Out[39]:
name age state
0 alice 25 ak
1 bob 26 ny
2 charlie 27 dc

### Add/append new column to existing Dataframe¶

In [40]:
df = pd.DataFrame({
'name': ['alice','bob','charlie'],
'age': [25,26,27]
})
df

Out[40]:
name age
0 alice 25
1 bob 26
2 charlie 27
In [41]:
#Now lets add a new column called state.
states = pd.Series(['dc','ca','ny'])

df['state'] = states
df

Out[41]:
name age state
0 alice 25 dc
1 bob 26 ca
2 charlie 27 ny

### Convert the data type¶

In [42]:
df = pd.DataFrame({
'name': ['alice','bob','charlie'],
'age': [25,26,27]
})

df

Out[42]:
name age
0 alice 25
1 bob 26
2 charlie 27
In [44]:
# Check the data type of age
print(df['age'].dtype)

int64

In [45]:
#Now convert age datatype to object
df['age'] = df['age'].astype(str)
print(df['age'].dtype)

object


### Convert column to date/datetime¶

In [46]:
df = pd.DataFrame({
'name': ['alice','bob','charlie'],
'date_of_birth': ['10/25/2005','10/29/2002','01/01/2001']
})
df

Out[46]:
name date_of_birth
0 alice 10/25/2005
1 bob 10/29/2002
2 charlie 01/01/2001
In [48]:
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
df

Out[48]:
name date_of_birth
0 alice 2005-10-25
1 bob 2002-10-29
2 charlie 2001-01-01

#### If you want date and time in a specific format¶

In [50]:
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'],format='%d/%m/%Y')
df

Out[50]:
name date_of_birth
0 alice 2005-10-25
1 bob 2002-10-29
2 charlie 2001-01-01
In [ ]: