There are lots of pandas operations that will be really useful to you but that don't fall into any distinct category. Let's show them here.
# Import pandas and build a small example DataFrame with two numeric
# columns (col1, col2) and one string column (col3).
import pandas as pd

df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 4],
        'col2': [444, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
# Preview the first rows of the frame.
df.head()
Find the unique values in a DataFrame column
# Distinct values that occur in col2
df['col2'].unique()
# Number of distinct values in col2
df['col2'].nunique()
# Find how many times each value has been repeated in a particular column
df['col2'].value_counts()
Select data on the basis of certain conditions
# Select rows using criteria from multiple columns: keep the rows where
# col1 is greater than 2 AND col2 equals 444. Each comparison is wrapped in
# parentheses because & binds tighter than > and ==.
newdf = df.loc[(df['col1'] > 2) & (df['col2'] == 444)]
newdf
# Create a function that doubles its argument (it multiplies by 2; it does
# not square the value)
def times2(x):
    """Return ``x`` multiplied by 2."""
    return x * 2
# Apply times2 to every element of column 1
df['col1'].apply(times2)
# Apply the built-in len to every element of column 3
df['col3'].apply(len)
# Sum of the elements of column 1
df['col1'].sum()
# Remove column 1 from the DataFrame (this modifies df itself)
del df['col1']
df
# List all column labels in the DataFrame
df.columns
Find all index labels in the DataFrame
df.index
df
# Sort the rows by the values in col2; the result is not assigned, so df
# keeps its order (inplace=False by default)
df.sort_values('col2')
# Show, cell by cell, which entries are null and which are not
df.isnull()
# Drop rows with NaN values
df.dropna()
import numpy as np
# Rebuild the example frame, this time with one missing value in col1 and
# one in col2.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, np.nan],
        'col2': [np.nan, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
df.head()
# Fill NaN values with 'FILL' (the result is not assigned, so df is unchanged)
df.fillna('FILL')
# Sample data: three label columns (A, B, C) and one numeric column (D).
data = {
    'A': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
    'B': ['one', 'one', 'two', 'two', 'one', 'one'],
    'C': ['x', 'y', 'x', 'y', 'x', 'y'],
    'D': [1, 3, 2, 5, 4, 1],
}
df = pd.DataFrame(data)
df
# Pivot: rows are indexed by (A, B), columns come from the values of C,
# and the cells hold the values of D.
df.pivot_table(values='D', index=['A', 'B'], columns=['C'])
df = pd.DataFrame({
    'name': ['alice', 'bob', 'charlie'],
    'age': [25, 26, 27],
})
df
# Change the column names. rename() returns a new DataFrame and leaves the
# original untouched, so the result is assigned back to df; otherwise the
# df shown below would still carry the old names.
df = df.rename(columns={'name': 'person_name', 'age': 'age_in_years'})
df
df = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie'],
        'age': [25, 26, 27],
    }
)
df
# Now change all names to uppercase: map() calls the function once per
# element of the 'name' column.
df['name'] = df['name'].map(str.upper)
df
import pandas as pd
import numpy as np
df = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie', 'david'],
        'age': [25, 26, 27, 22],
    }
)
df
# Now use the apply() function: the lambda receives the whole 'age' column
# at once and doubles it with a single NumPy call.
df['age_times_2'] = df[['age']].apply(lambda column: np.multiply(column, 2))
df
Some basic differences between map() and apply():
.map() can only be applied to a single column (one element at a time), whereas .apply() can be applied to multiple columns at the same time.
.map() with a Python function is slow because it is called once per element, but .apply() can be much faster when you can use NumPy vectorized functions on whole columns.
df = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie'],
        'age': [25, 26, 27],
        'state': ['ak', 'ny', 'dc'],
    }
)
df
# Get the column labels as an array
print(df.columns.values)
# Get the number of columns in the DataFrame
print(len(df.columns))
To reorder columns, just reassign the dataframe with the columns in the order you want:
df = pd.DataFrame(
    {
        'age': [25, 26, 27],
        'name': ['alice', 'bob', 'charlie'],
        'state': ['ak', 'ny', 'dc'],
    }
)
df
# We want name to be the first column: select the columns in the desired
# order and assign the result back to df.
df = df[['name', 'age', 'state']]
df
df = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie'],
        'age': [25, 26, 27],
    }
)
df
# Now let's add a new column called state: build a Series holding one value
# per row and assign it under the new column label.
states = pd.Series(['dc', 'ca', 'ny'])
df['state'] = states
df
df = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie'],
        'age': [25, 26, 27],
    }
)
df
# Check the data type of age before the conversion
print(df['age'].dtype)
# Now convert the age values to strings and print the resulting dtype
df['age'] = df['age'].astype(str)
print(df['age'].dtype)
df = pd.DataFrame({
    'name': ['alice', 'bob', 'charlie'],
    'date_of_birth': ['10/25/2005', '10/29/2002', '01/01/2001'],
})
df
# Convert the date strings to datetimes, letting pandas work out the layout.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
df
# Same conversion with an explicit format. The sample strings are
# month/day/year ('10/25/2005' cannot be day 10 of month 25), so the
# matching format is '%m/%d/%Y'; '%d/%m/%Y' does not describe this data.
# NOTE(review): the column has already been converted by the call above, so
# this second call appears to leave the values unchanged - confirm.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], format='%m/%d/%Y')
df