# Python basics: arithmetic, printing, literal types, type() and conversions.
# Bare expressions only display a value when run interactively (notebook/REPL).
2+2
print(4+6)
# print 5+5   # Python 2 statement syntax: a SyntaxError in Python 3, kept only as a comment
print(5+5) # print is a function in Python 3, so the argument needs parentheses
4 #int
4.3 # float
"hello world" # string
True #boolean
False #boolean
None # undefined
type(4) # type() is useful for figuring out what kind of data you have
type(5.4)
type("hello world")
int(6.42) #convert float -> int
str(5) # convert int -> string
#group question
#How can we make this work?
# print(5 + " is a number")   # does not work: TypeError, an int and a str cannot be added
print(str(5) + " is a number") # works: convert the int to a string before concatenating
# group question
type(1/2) # float, int, string?
# Comparisons, boolean operators and variable assignment.
5 > 4
# 5 >= 4, 5 <= 4, 5 == 4, 5 < 4, 5> 4
5 == 4 # false
5.0 == 5 # convert, True
5.0 == "5" # false
(5 > 4) and (5 < 3)
#group question
(5 > 4) or (5 < 3)
x = 3.2
y = "hello world"
helloHappyPeople = "hello world"
helloHappyPeople # tab to autocomplete
z = "3.2"
#group question
x == z # False or True?
#group question
x = True
y = False
z = False
# `and` binds more tightly than `or`, so the condition groups as x or (y and z).
if x or y and z:
    print('yes')
else:
    print('no')
# will the output be yes or no?
#group question
x or (y and z)
#group question
x or y or z
# Tuples versus lists: creation, mutation, indexing, nesting and concatenation.
names_tuples = ('alice','bob','sam') # tuple, are not modifiable (immutable)
names_list = ['alice','bob','sam'] #list, are changeable (mutable)
try:
    names_tuples.append('jake') # tuples have no append(): this raises AttributeError
except AttributeError as err:
    print(err) # report the error and continue instead of aborting the script
names_list.append('jake')
names_list
names_list.insert(3,'chelsea') # chelsea inserted at position 3
names_list
names_list[1] # to get bob
names_list.pop() # pull from end
names_list
names_list.index('sam') #tell me where sam is in the list
len(names_list) # give me the length --> 3 or 4
a = [[1,2],[4,5]]
a[1][0] # will I get 1,2,4, or 5?
#group question
a = [1,2,3,None,(),[],] # what is len(a)? 4,5,6,7 or error?
# concat lists
list1 = [1,2,3,4]
list2 = [5,6,7,8]
list1 + list2
#group question
#What is the answer?
"apple" + "bana"
class Person:
    """A person with a name, a weight and a height who can report a BMI.

    NOTE(review): getBMI multiplies by 703, which looks like the imperial BMI
    formula (weight in pounds, height in inches) -- confirm the intended units.
    """

    fullname = None # this is a field (class-level default)
    weight = 0 # also a field
    height = 0 # also a field

    def __init__(self, name, w, h): # constructor (make a new person)
        """Store the name, weight ``w`` and height ``h`` on the new instance."""
        self.fullname = name
        self.weight = w
        self.height = h

    def getFullName(self): # give me your name
        """Return the stored full name."""
        return self.fullname

    def setWeight(self, newweight): # update your weight
        """Replace the stored weight with ``newweight``."""
        self.weight = newweight

    def getBMI(self):
        """Return 703 * weight / height**2, truncated to an int.

        Raises ZeroDivisionError when height is 0.
        """
        return int(703 * self.weight / (self.height * self.height))
# Create two Person instances, then query and update them.
alex = Person(name="alex", w=150, h=68)  # a new instance of Person
jane = Person(name="jane", w=130, h=68)  # a second, independent instance
alex.getFullName()  # ask alex for the stored name
jane.getFullName()  # ask jane for the stored name
alex.getBMI()  # BMI computed from alex's current weight and height
alex.setWeight(newweight=190)  # change alex's weight ...
alex.getBMI()  # ... and compute the BMI again with the new weight
import numpy as np #load up the libraries and object defs. we need
import pandas as pd
from pandas import DataFrame, Series
# tell ipython notebook to print visualizations into chrome
%pylab
%matplotlib inline
# load up my visualization system, and call the object plt
import matplotlib.pyplot as plt
# Build a small class roster: one row per student, with their year and grade.
myclass = pd.DataFrame(
    {
        'student': ['alice', 'bob', 'louis', 'jen'],
        'year': [4, 4, 3, 3],
        'grade': [10, 9, 10, 10],
    }
)
myclass  # display the roster
myclass.shape  # (number of rows, number of columns)
myclass.columns  # the column names
myclass['year'].unique()  # the distinct years that occur
pd.crosstab(index=myclass['grade'], columns=myclass['year'])  # head-count for each grade/year combination
plt.hist(myclass['grade'])  # histogram of the grade column
pd.crosstab(index=myclass['year'], columns=myclass['grade'])  # same counts with rows/columns swapped
myclass.info()
myclass.describe()
myclass.transpose()  # rows become columns and vice versa
# Re-bind myclass to a new DataFrame that uses the student column as its index.
myclass = myclass.set_index('student')
myclass
yrg = myclass.groupby('year')  # one group per distinct year
yrg.describe()  # summary statistics computed separately for each group
myclass['grade'] >= 10
goodgrades = myclass['grade'] >= 10  # boolean mask: True where grade is at least 10
goodgrades
myclass[goodgrades]  # keep only the rows where the mask is True
myclass
myclass['year'] == 4  # all people who are seniors
(myclass['year'] == 4) & (myclass['grade'] >= 10)  # & combines the two masks element-wise
flt = (myclass['year'] == 4) & (myclass['grade'] >= 10)  # the combined filtering criteria
myclass[flt]
olive_oil = pd.read_csv('olive.csv')  # load the olive data from the working directory
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number
import matplotlib #only needed to determine Matplotlib version number
# Enable inline plotting
%matplotlib inline
Install the statsmodels package.
statsmodels is a Python package that provides various statistical analysis tools. We will use statsmodels for this lab. The package does not come with Anaconda by default, but we can easily install it.
Open Terminal (or cmd in Windows) and run:
conda install statsmodels
If you are using a conda environment (activated with "source activate python3" on Mac or "activate python3" on Windows), then do the following instead:
conda install --name=python3 statsmodels
where python3 should be whatever the name of your conda environment is.
#Even though this function has many parameters, we will simply pass it the location of the text file.
# Example location from another machine, kept for reference only.
# NOTE(review): it names seeds-subset.csv, not the olive.csv that is actually loaded below.
#Location = C:\Users\fatem_000\OneDrive\Academic\2014 Summer\TA\599-VIS-Fall2016\Lab2\seeds-subset.csv
#Note: Depending on where you save your notebooks, you may need to modify the Location path below.
Location = r'C:\Users\fara\OneDrive\Academic\2014 Summer\TA\599-VIS-Fall2016\Lab2\olive.csv'
olive_oil = pd.read_csv(Location) # load up the file
olive_oil
Let us start with the necessary imports and data loading. Just execute every cell below. Remember: a convenient shortcut to run a cell and then jump to the next cell is Shift + Enter.
import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline
# This is the Anscombe's quartet
# Source - Wikipedia: https://en.wikipedia.org/wiki/Anscombe%27s_quartet
from io import StringIO
# Wrap the CSV text in an in-memory file object so pandas can parse it like a file.
TESTDATA=StringIO("""X1,Y1,X2,Y2,X3,Y3,X4,Y4
10,8.04,10,9.14,10,7.46,8,6.58
8,6.95,8,8.14,8,6.77,8,5.76
13,7.58,13,8.74,13,12.74,8,7.71
9,8.81,9,8.77,9,7.11,8,8.84
11,8.33,11,9.26,11,7.81,8,8.47
14,9.96,14,8.1,14,8.84,8,7.04
6,7.24,6,6.13,6,6.08,8,5.25
4,4.26,4,3.1,4,5.39,19,12.5
12,10.84,12,9.13,12,8.15,8,5.56
7,4.82,7,7.26,7,6.42,8,7.91
5,5.68,5,4.74,5,5.73,8,6.89""")
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv is its replacement. index_col=None keeps the default integer index.
df = pd.read_csv(TESTDATA, index_col=None)
df
This dataset contains 4 groups of data: (X1, Y1) to (X4, Y4).
It is not hard to notice that X1, X2, and X3 are identical. But that doesn't matter now.
We can do a quick visualization of the data by creating scatterplots.
# Draw each (Xj, Yj) pair as a scatterplot on its own axes in a 2x2 grid.
fig, axs = plt.subplots(2, 2, figsize=(12,9))
for i, ax in enumerate(axs.flat):
    j = i + 1  # column names are 1-based (X1..X4) while enumerate counts from 0
    ax.scatter(df['X%d'%j], df['Y%d'%j])
    ax.set_title('(%d)'%j)
    ax.set_xlabel('X%d'%j)
    ax.set_ylabel('Y%d'%j)
    ax.grid(True)
Pandas DataFrames and Series come with a number of convenient functions for computing basic summary statistics. Try the following commands. Their meanings are quite self-explanatory.
# Column-wise summary statistics of df.
df.mean()
df.median()
df.std()
df.max()
# This one computes the maximum along the 1st axis (i.e., across columns), giving one value per row.
df.max(axis=1)
df.var()
You can also apply these functions to a single column (i.e., a Series). For example...
df.X1.mean() # mean of the single column X1
# This one is slightly more complicated... But you can figure this out easily.
(df.X1 + df.X2).mean() # element-wise sum of two columns, then the mean of that sum
# And finally...
df.describe()
To quickly understand the distribution of data, it is a good idea to use binning and grouping, and to create histograms.
Use the following code to create bins based on X1's values, and check the mean value of both X1 and Y1 within each bin.
# With so few data points 3 bins are enough: num=4 edges between 3 and 15 delimit 3 intervals.
bins_by_x1 = np.linspace(3, 15, num=4)
# Group the X1/Y1 columns by the interval that each X1 value falls into.
groups_by_x1 = df[['X1', 'Y1']].groupby(
    pd.cut(df['X1'], bins=bins_by_x1)
)
groups_by_x1.mean()
The above result shows the mean values of X1 and Y1, binned by the value of X1.
To understand what is going on in the above commands, feel free to print out the intermediate variables, including:
bins_by_x1 # the bin edges
pd.cut(df.X1, bins_by_x1) # the interval assigned to each X1 value
groups_by_x1 # the grouping object itself
Also try the following commands:
groups_by_x1.median() # median of each column within each bin
groups_by_x1.size().to_frame(name='count') # size of each bin, as a DataFrame with a single 'count' column
In addition to using existing aggregate functions (i.e., max, min, mean, median, size, etc.), you can also define custom functions to "apply" to the grouping object.
The following should generate the same result as the previous one. Try to figure out how it works.
# The lambda is applied to each group; len(x) is the length of that group.
groups_by_x1.apply(lambda x: len(x)).to_frame(name='count')
# Or, alternatively, and more confusingly ...
groups_by_x1.apply(lambda x: pd.Series({'count': len(x)}))
To create a histogram, simply use .hist() on a Series object. It automatically handles binning, grouping, and counting.
By default, hist() generates 10 bins. You may customize this by specifying the bins parameter, for example, hist(bins=5).
Try the following.
df.Y1.hist(bins=5) # histogram of the Y1 column using 5 bins
You may also call hist() on a DataFrame object and obtain a panel of histograms, one for each individual column.
# One histogram per selected Y column, 5 bins each; assigning to _ discards the return value.
_ = df[['Y1','Y2','Y3','Y4']].hist(bins=5, figsize=(10,6))
In the above example, notice two things:
1. I used figsize=(10, 6) to specify the size of the figure (the unit is inches, although the %matplotlib inline configuration reduces the figure sizes by a predefined ratio automatically).
2. I put "_ =" in the front to avoid seeing the returned value of df.hist, which I do not care about. If you are curious, it is an array of axes objects of matplotlib. _ can be used whenever you want to ignore the return value of a function.
You can also make histograms that have side-by-side bars for multiple variables. For the following example, I used matplotlib's hist function instead of the pandas equivalent, because it is easier to use the former to make side-by-side histograms like this.
# Six evenly spaced edges from 2 to 12 give five bins, shared by all four series.
bins = np.linspace(2, 12, num=6)
# Passing a 2-D array (one column per variable) together with one label per column.
plt.hist(df[['Y1', 'Y2', 'Y3', 'Y4']].values, bins=bins, label=['Y1', 'Y2', 'Y3', 'Y4'])
plt.legend(loc="upper left")
plt.grid(True)
Boxplots (or box-and-whisker diagrams) are another good way of depicting the distribution of numerical data by showing the median, min, max, and interquartile range (IQR).
Using pandas, it is very easy to create a boxplot of multiple variables.
_ = df[['X1','Y1']].boxplot()