Students' test performance analysis

Data visualisations made by Janhavi Pimplikar

A student at Pimpri Chinchwad College of Engineering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
import seaborn as sns
import plotly as py
import cufflinks as cf
In [3]:
py.offline.init_notebook_mode(connected=True)
In [4]:
cf.go_offline()
In [5]:
from plotly.offline import iplot

Univariate analysis of student record

In [6]:
student=pd.read_csv('Studentrecord.csv')
In [7]:
student
Out[7]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
... ... ... ... ... ... ... ... ...
995 female group E master's degree standard completed 88 99 95
996 male group C high school free/reduced none 62 55 55
997 female group C high school free/reduced completed 59 71 65
998 female group D some college standard completed 68 78 77
999 female group D some college free/reduced none 77 86 86

1000 rows × 8 columns

In [8]:
gender=student['gender'].value_counts()
In [9]:
gender
Out[9]:
female    518
male      482
Name: gender, dtype: int64
In [10]:
explode=(0.0,0.2)
gender.plot.pie(figsize=(7,7),legend=True,autopct='%1.1f%%',fontsize=15,shadow=True,colors=sns.color_palette('Spectral'))
c=plt.Circle((0,0),0.3,color='white')
plt.gca().add_artist(c)
Out[10]:
<matplotlib.patches.Circle at 0x24726a3b708>
In [11]:
student['race/ethnicity']
Out[11]:
0      group B
1      group C
2      group B
3      group A
4      group C
        ...   
995    group E
996    group C
997    group C
998    group D
999    group D
Name: race/ethnicity, Length: 1000, dtype: object
In [12]:
fig,ax=plt.subplots(figsize=(10,5))
graph=sns.countplot('race/ethnicity',data=student,ax=ax)
sns.set_style('darkgrid')
graph.set_xticklabels(graph.get_xticklabels(),size=12)
plt.xlabel('Race/Ethnicity of students',size=15)
plt.ylabel('Number of students',size=15)
plt.title('Race analysis of students',size=20)
Out[12]:
Text(0.5, 1.0, 'Race analysis of students')
In [13]:
edunumber=student['parental level of education'].value_counts()
In [14]:
edunumber
Out[14]:
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64
In [15]:
edunumber.iplot(kind='bar',legend=True,color='lightgreen',xTitle='Education level',yTitle='Number of students',title='Education level of each student')
In [16]:
fig,ax=plt.subplots(figsize=(10,5))
sns.countplot(student['lunch'],palette='inferno')
plt.style.use('ggplot')
plt.xlabel('Lunch status',size=15)
plt.ylabel('Number of students',size=15)
plt.xticks(size=15)
plt.yticks(size=15)
Out[16]:
(array([  0., 100., 200., 300., 400., 500., 600., 700.]),
 <a list of 8 Text yticklabel objects>)
In [17]:
fig,ax=plt.subplots(figsize=(10,6))
g=sns.distplot(student['math score'],color='orange',kde=False,ax=ax)
sns.set_style('darkgrid')
plt.xlabel('Maths scores',size=15)
plt.xticks(size=15)
plt.yticks(size=15)
g.set_title('Frequency distribution of math scores of students',size=15)
Out[17]:
Text(0.5, 1.0, 'Frequency distribution of math scores of students')
In [18]:
fig,ax=plt.subplots(figsize=(10,6))
g=sns.distplot(student['reading score'],color='blue',kde=False,ax=ax)
sns.set_style('darkgrid')
plt.xlabel('Reading scores',size=15)
plt.xticks(size=15)
plt.yticks(size=15)
g.set_title('Frequency distribution of reading scores of students',size=15)
Out[18]:
Text(0.5, 1.0, 'Frequency distribution of reading scores of students')
In [19]:
fig,ax=plt.subplots(figsize=(10,6))
g=sns.distplot(student['writing score'],color='green',kde=False,ax=ax)
sns.set_style('darkgrid')
plt.xlabel('Writing scores',size=15)
plt.xticks(size=15)
plt.yticks(size=15)
g.set_title('Frequency distribution of writing scores of students',size=15)
Out[19]:
Text(0.5, 1.0, 'Frequency distribution of writing scores of students')

Analysis of female students

In [20]:
fem=student.loc[student['gender']=='female']
In [21]:
fem
Out[21]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
5 female group B associate's degree standard none 71 83 78
6 female group B some college standard completed 88 95 92
... ... ... ... ... ... ... ... ...
993 female group D bachelor's degree free/reduced none 62 72 74
995 female group E master's degree standard completed 88 99 95
997 female group C high school free/reduced completed 59 71 65
998 female group D some college standard completed 68 78 77
999 female group D some college free/reduced none 77 86 86

518 rows × 8 columns

In [22]:
gender=fem['gender'].value_counts()
parental_level_of_education=fem['parental level of education'].value_counts()
fig,ax=plt.subplots(figsize=(8,8),dpi=100)
size=0.3
cmap=plt.get_cmap('Pastel1')
outer_shades=cmap(np.arange(2)*1)
inner_shades=cmap(np.arange(10)*1)
gender.plot.pie(radius=1,colors=outer_shades,wedgeprops=dict(width=size),ax=ax,startangle=60,fontsize=15)
parental_level_of_education.plot.pie(radius=1-size,autopct='%1.2f%%',colors=inner_shades,wedgeprops=dict(width=size),ax=ax,startangle=60,fontsize=10)
plt.ylabel('Parental level of education (females)',size=15)
plt.show()
In [23]:
race=fem['race/ethnicity'].value_counts()
fig,ax=plt.subplots(figsize=(8,8),dpi=100)
size=0.3
cmap=plt.get_cmap('Dark2')
inner_shades=cmap(np.arange(10)*1)
race.plot.pie(radius=1-size,autopct='%1.2f%%',colors=inner_shades,wedgeprops=dict(width=size),ax=ax,startangle=60,fontsize=15,shadow=True,legend=True)
plt.legend(loc='best',title='Racial groups')
plt.ylabel('Racial groups (females)',size=15)
plt.show()
In [24]:
femscores=fem[['math score','reading score','writing score']]
In [25]:
femscores
Out[25]:
math score reading score writing score
0 72 72 74
1 69 90 88
2 90 95 93
5 71 83 78
6 88 95 92
... ... ... ...
993 62 72 74
995 88 99 95
997 59 71 65
998 68 78 77
999 77 86 86

518 rows × 3 columns

In [26]:
femscores.iplot(kind='box',legend=True,xTitle='Subjects',yTitle='Score',title='Score analysis of female students',colors={'math score':'fuchsia','reading score':'purple','writing score':'yellow'})

Analysis of male students

In [27]:
male=student.loc[student['gender']=='male']
In [28]:
male
Out[28]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
7 male group B some college free/reduced none 40 43 39
8 male group D high school free/reduced completed 64 64 67
10 male group C associate's degree standard none 58 54 52
... ... ... ... ... ... ... ... ...
985 male group A high school standard none 57 51 54
987 male group E some high school standard completed 81 75 76
990 male group E high school free/reduced completed 86 81 75
994 male group A high school standard none 63 63 62
996 male group C high school free/reduced none 62 55 55

482 rows × 8 columns

In [29]:
gender1=male['gender'].value_counts()
parental_level_of_education=male['parental level of education'].value_counts()
fig,ax=plt.subplots(figsize=(8,8),dpi=100)
size=0.3
cmap=plt.get_cmap('Pastel2')
outer_shades=cmap(np.arange(2)*1)
inner_shades=cmap(np.arange(10)*1)
plots=gender1.plot.pie(radius=1,colors=outer_shades,wedgeprops=dict(width=size),ax=ax,startangle=60,fontsize=15)
parental_level_of_education.plot.pie(radius=1-size,autopct='%1.2f%%',colors=inner_shades,wedgeprops=dict(width=size),ax=ax,startangle=60,fontsize=10)
plt.ylabel('Parental level of education (males)',size=15)
plt.show()
In [30]:
race1=male['race/ethnicity'].value_counts()
fig,ax=plt.subplots(figsize=(8,8),dpi=100)
size=0.3
cmap=plt.get_cmap('Set1')
inner_shades=cmap(np.arange(10)*1)
race1.plot.pie(radius=1-size,autopct='%1.2f%%',colors=inner_shades,wedgeprops=dict(width=size),ax=ax,startangle=60,fontsize=15,shadow=True,legend=True)
plt.legend(loc='best',title='Racial groups')
plt.ylabel('Racial groups (males)',size=15)
plt.show()
In [31]:
malescores=male[['math score','reading score','writing score']]
In [32]:
malescores
Out[32]:
math score reading score writing score
3 47 57 44
4 76 78 75
7 40 43 39
8 64 64 67
10 58 54 52
... ... ... ...
985 57 51 54
987 81 75 76
990 86 81 75
994 63 63 62
996 62 55 55

482 rows × 3 columns

In [33]:
malescores.iplot(kind='box',legend=True,xTitle='Subjects',yTitle='Score',title='Score analysis of male students',colors={'math score':'red','reading score':'navy','writing score':'brown'})

Bivariate scores analysis of students (both genders)

In [34]:
sns.catplot(x='race/ethnicity',y='math score',hue='gender',data=student,height=6,aspect=2,palette='CMRmap')
sns.set_style('whitegrid')
sns.despine(right=False)
plt.xticks(size=15,rotation=45)
plt.xlabel('Race/Ethnicity',size=15,color='red')
plt.ylabel('Math score',size=15,color='red')
plt.title('Math score analysis by race',size=20)
Out[34]:
Text(0.5, 1, 'Math score analysis by race')
In [35]:
sns.catplot(x='race/ethnicity',y='reading score',hue='gender',data=student,height=6,aspect=2,palette='gnuplot2')
sns.set_style('whitegrid')
sns.despine(right=False)
plt.xticks(size=15,rotation=45)
plt.xlabel('Race/Ethnicity',size=15,color='red')
plt.ylabel('Reading score',size=15,color='red')
plt.title('Reading score analysis by race',size=20)
Out[35]:
Text(0.5, 1, 'Reading score analysis by race')
In [36]:
sns.catplot(x='race/ethnicity',y='writing score',hue='gender',data=student,height=6,aspect=2,palette='gist_rainbow')
sns.set_style('whitegrid')
sns.despine(right=False)
plt.xticks(size=15,rotation=45)
plt.xlabel('Race/Ethnicity',size=15,color='red')
plt.ylabel('Writing score',size=15,color='red')
plt.title('Writing score analysis by race',size=20)
Out[36]:
Text(0.5, 1, 'Writing score analysis by race')
In [37]:
mathedu=student[['gender','parental level of education','math score','reading score','writing score']]
In [38]:
mathedu=mathedu.head(300)
In [39]:
total=mathedu['math score'] + mathedu['reading score'] + mathedu['writing score']  #adding column to an existing subset of dataframe
In [40]:
mathedu['total score']=total
In [41]:
mathedu
Out[41]:
gender parental level of education math score reading score writing score total score
0 female bachelor's degree 72 72 74 218
1 female some college 69 90 88 247
2 female master's degree 90 95 93 278
3 male associate's degree 47 57 44 148
4 male some college 76 78 75 229
... ... ... ... ... ... ...
295 male associate's degree 67 62 60 189
296 male some high school 46 41 43 130
297 male associate's degree 71 74 68 213
298 male high school 40 46 50 136
299 male associate's degree 90 87 75 252

300 rows × 6 columns

In [42]:
ax=sns.catplot(x='parental level of education',y='total score',hue='gender',data=mathedu,kind='swarm',height=6,aspect=2,legend=True,palette='CMRmap')
sns.set_style('darkgrid')
plt.xlabel('Parental level of education',size=15)
plt.xticks(size=15)
plt.yticks(size=15)
plt.ylabel('Total score',size=15)
plt.title('Score analysis of first 300 students with respect to educational background and gender',size=15)
plt.legend(fontsize='xx-large', title_fontsize='40') #works with catplot and also provides better visibiltiy to legend
Out[42]:
<matplotlib.legend.Legend at 0x24728c44388>
In [43]:
mathedu=mathedu[['math score','reading score','writing score']]
In [44]:
mathedu
Out[44]:
math score reading score writing score
0 72 72 74
1 69 90 88
2 90 95 93
3 47 57 44
4 76 78 75
... ... ... ...
295 67 62 60
296 46 41 43
297 71 74 68
298 40 46 50
299 90 87 75

300 rows × 3 columns

In [45]:
mathedu.iplot(kind='box',xTitle='Subjects',yTitle='Scores',title='Scores analysis of each subject (300 students)')
In [ ]:
 

dataset imported from https://www.kaggle.com/spscientist/students-performance-in-exams?select=StudentsPerformance.csv

attributes: Websites such as stackoverflow.com, medium.com, geeksforgeeks.com etc.

In [ ]: