Observation 1: Overall Reading percentages are greater compared to Overall Math percentages.
Observation 2: Charter-type Schools have better performance compared to District-type Schools.
Observation 3: The School Spending budget on each student is inversely proportional to the overall pass percentages of the school.
Observation 4: Large sized Schools has lower overall pass percentage compared to Medium and Small sized Schools.
#Import Dependencies
import pandas as pd
import os
import numpy as np
Reading both CSV files(Students and Schools)
#File Path for students
StudentsFilePath = os .path .join ("raw_data" ,"students_complete.csv" )
#Open and read the file
Student_DF = pd .read_csv (StudentsFilePath )
#File Path for School
SchoolFilePath = os .path .join ("raw_data" ,"schools_complete.csv" )
#Open and read the file
Schools_DF = pd .read_csv (SchoolFilePath )
#Renaming both dataframe school name column to the same name
Student_DF = Student_DF .rename (columns = {"school" :"School Name" })
Schools_DF = Schools_DF .rename (columns = {"name" :"School Name" })
#District Summary
Total_Students = len (Student_DF ["Student ID" ].unique ())
District_Summary = pd .DataFrame ({"Total Schools" :[len (Schools_DF ["School ID" ].unique ())],
"Total Students" :[Total_Students ],
"Total Budget" :"$" + str (Schools_DF ["budget" ].sum ()),
"Average Math Score" :[round (Student_DF ["math_score" ].mean (),2 )],
"Average Reading Score" :[round (Student_DF ["reading_score" ].mean (),2 )],
"% Passing Math" :[(len (Student_DF [Student_DF ["math_score" ]>= 70 ])/ Total_Students )* 100 ],
"% Passing Reading" :[(len (Student_DF [Student_DF ["reading_score" ]>= 70 ])/ Total_Students )* 100 ]
})
District_Summary ["% Overall Passing Rate" ]= (District_Summary ["% Passing Math" ]+ District_Summary ["% Passing Reading" ])/ 2
#Rearranging the order of columns
District_Summary = District_Summary [["Total Schools" ,"Total Students" ,"Total Budget" ,
"Average Math Score" ,"Average Reading Score" ,
"% Passing Math" ,"% Passing Reading" ,"% Overall Passing Rate" ]]
#Formatting the columns to the desire format
District_Summary ["% Passing Math" ]= District_Summary ["% Passing Math" ].map ("{:.2f}%" .format )
District_Summary ["% Passing Reading" ]= District_Summary ["% Passing Reading" ].map ("{:.2f}%" .format )
District_Summary ["% Overall Passing Rate" ]= District_Summary ["% Overall Passing Rate" ].map ("{:.2f}%" .format )
District_Summary
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
Total Schools
Total Students
Total Budget
Average Math Score
Average Reading Score
% Passing Math
% Passing Reading
% Overall Passing Rate
0
15
39170
$24649428
78.99
81.88
74.98%
85.81%
80.39%
#Combined school and student data
School_data_complete = pd .merge (Schools_DF ,Student_DF ,on = "School Name" ,how = "outer" ).groupby (["School Name" ],as_index = True )
#Calculating and storing all value of each school into lists
School_Type = School_data_complete ["type" ].unique ().str .get (0 )
Total_Budget = School_data_complete ["budget" ].unique ().str .get (0 )
Total_Students = School_data_complete .count ()["Student ID" ]
per_Student_Budget = Total_Budget / Total_Students
avg_math_score = School_data_complete .mean ()["math_score" ]
avg_reading_score = School_data_complete .mean ()["reading_score" ]
Count_Percent_Passing_Math = Student_DF .loc [Student_DF ["math_score" ]>= 70 ,:].groupby ([Student_DF ["School Name" ]],as_index = True ).count ()["math_score" ]
Percent_Passing_Math = round (Count_Percent_Passing_Math * 100 / Total_Students ,2 )
Count_Percent_Passing_Reading = Student_DF .loc [Student_DF ["reading_score" ]>= 70 ,:].groupby ([Student_DF ["School Name" ]],as_index = True ).count ()["reading_score" ]
Percent_Passing_Reading = round (Count_Percent_Passing_Reading * 100 / Total_Students ,2 )
Percent_Overall_Passing = round ((Percent_Passing_Math + Percent_Passing_Reading )/ 2 ,2 )
#School Summary DataFrame
School_Summary = pd .DataFrame ({
"School Type" :School_Type ,
"Total Students" :Total_Students ,
"Total School Budget" :Total_Budget ,
"Per Student Budget" :per_Student_Budget ,
"Average Math Score" :round (avg_math_score ,2 ),
"Average Reading Score" :round (avg_reading_score ,2 ),
"% Passing Maths" :Percent_Passing_Math ,
"% Passing Reading" :Percent_Passing_Reading ,
"% Overall Passing Rate" :Percent_Overall_Passing
})
#Formatting the columns to the desire format
School_Summary ["Total School Budget" ]= School_Summary ["Total School Budget" ].map ("${:,}" .format )
School_Summary ["Per Student Budget" ]= School_Summary ["Per Student Budget" ].astype (int ).map ("${}" .format )
School_Summary ["% Passing Maths" ]= School_Summary ["% Passing Maths" ].map ("{:.2f}%" .format )
School_Summary ["% Passing Reading" ]= School_Summary ["% Passing Reading" ].map ("{:.2f}%" .format )
School_Summary ["% Overall Passing Rate" ]= School_Summary ["% Overall Passing Rate" ].map ("{:.2f}%" .format )
#Rearranging the order of columns
School_Summary = School_Summary [["School Type" ,"Total Students" ,"Total School Budget" ,
"Per Student Budget" ,"Average Math Score" ,"Average Reading Score" ,
"% Passing Maths" ,"% Passing Reading" ,"% Overall Passing Rate" ]]
School_Summary
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
School Type
Total Students
Total School Budget
Per Student Budget
Average Math Score
Average Reading Score
% Passing Maths
% Passing Reading
% Overall Passing Rate
School Name
Bailey High School
District
4976
$3,124,928
$628
77.05
81.03
66.68%
81.93%
74.31%
Cabrera High School
Charter
1858
$1,081,356
$582
83.06
83.98
94.13%
97.04%
95.58%
Figueroa High School
District
2949
$1,884,411
$639
76.71
81.16
65.99%
80.74%
73.36%
Ford High School
District
2739
$1,763,916
$644
77.10
80.75
68.31%
79.30%
73.81%
Griffin High School
Charter
1468
$917,500
$625
83.35
83.82
93.39%
97.14%
95.26%
Hernandez High School
District
4635
$3,022,020
$652
77.29
80.93
66.75%
80.86%
73.81%
Holden High School
Charter
427
$248,087
$581
83.80
83.81
92.51%
96.25%
94.38%
Huang High School
District
2917
$1,910,635
$655
76.63
81.18
65.68%
81.32%
73.50%
Johnson High School
District
4761
$3,094,650
$650
77.07
80.97
66.06%
81.22%
73.64%
Pena High School
Charter
962
$585,858
$609
83.84
84.04
94.59%
95.95%
95.27%
Rodriguez High School
District
3999
$2,547,363
$637
76.84
80.74
66.37%
80.22%
73.30%
Shelton High School
Charter
1761
$1,056,600
$600
83.36
83.73
93.87%
95.85%
94.86%
Thomas High School
Charter
1635
$1,043,130
$638
83.42
83.85
93.27%
97.31%
95.29%
Wilson High School
Charter
2283
$1,319,574
$578
83.27
83.99
93.87%
96.54%
95.21%
Wright High School
Charter
1800
$1,049,400
$583
83.68
83.96
93.33%
96.61%
94.97%
#Top Performing Schools based on Overall Passing Rate
Top_Performing_Schools = School_Summary .sort_values ("% Overall Passing Rate" ,ascending = False ).head ()
Top_Performing_Schools
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
School Type
Total Students
Total School Budget
Per Student Budget
Average Math Score
Average Reading Score
% Passing Maths
% Passing Reading
% Overall Passing Rate
School Name
Cabrera High School
Charter
1858
$1,081,356
$582
83.06
83.98
94.13%
97.04%
95.58%
Thomas High School
Charter
1635
$1,043,130
$638
83.42
83.85
93.27%
97.31%
95.29%
Pena High School
Charter
962
$585,858
$609
83.84
84.04
94.59%
95.95%
95.27%
Griffin High School
Charter
1468
$917,500
$625
83.35
83.82
93.39%
97.14%
95.26%
Wilson High School
Charter
2283
$1,319,574
$578
83.27
83.99
93.87%
96.54%
95.21%
#Bottom Performing Schools based on Overall Passing Rate
Bottom_Performing_Schools = School_Summary .sort_values ("% Overall Passing Rate" ,ascending = True ).head ()
Bottom_Performing_Schools
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
School Type
Total Students
Total School Budget
Per Student Budget
Average Math Score
Average Reading Score
% Passing Maths
% Passing Reading
% Overall Passing Rate
School Name
Rodriguez High School
District
3999
$2,547,363
$637
76.84
80.74
66.37%
80.22%
73.30%
Figueroa High School
District
2949
$1,884,411
$639
76.71
81.16
65.99%
80.74%
73.36%
Huang High School
District
2917
$1,910,635
$655
76.63
81.18
65.68%
81.32%
73.50%
Johnson High School
District
4761
$3,094,650
$650
77.07
80.97
66.06%
81.22%
73.64%
Ford High School
District
2739
$1,763,916
$644
77.10
80.75
68.31%
79.30%
73.81%
Math Scores based on Grades for each School
#Math Scores by grade
Grouped_SchoolName_Grade = Student_DF .groupby ([Student_DF ["School Name" ],Student_DF ["grade" ]],as_index = False )
Math_Score_Grade_DF = Grouped_SchoolName_Grade ["math_score" ].mean ()
#Creating lists
math_SchoolNames = []
math_mean_grade_9th = []
math_mean_grade_10th = []
math_mean_grade_11th = []
math_mean_grade_12th = []
for school in Math_Score_Grade_DF ["School Name" ].unique ():
School_Name = school
math_SchoolNames .append (School_Name )
index = math_SchoolNames .index (school )
math_mean_grade_9th .insert (index ,round (Math_Score_Grade_DF .loc [(Math_Score_Grade_DF ["School Name" ]== school ) & (Math_Score_Grade_DF ["grade" ]== "9th" ),"math_score" ].values [0 ],2 ))
math_mean_grade_10th .insert (index ,round (Math_Score_Grade_DF .loc [(Math_Score_Grade_DF ["School Name" ]== school ) & (Math_Score_Grade_DF ["grade" ]== "10th" ),"math_score" ].values [0 ],2 ))
math_mean_grade_11th .insert (index ,round (Math_Score_Grade_DF .loc [(Math_Score_Grade_DF ["School Name" ]== school ) & (Math_Score_Grade_DF ["grade" ]== "11th" ),"math_score" ].values [0 ],2 ))
math_mean_grade_12th .insert (index ,round (Math_Score_Grade_DF .loc [(Math_Score_Grade_DF ["School Name" ]== school ) & (Math_Score_Grade_DF ["grade" ]== "12th" ),"math_score" ].values [0 ],2 ))
#Creating the dataframe of Math Scores based on the Grade
New_Math_DF = pd .DataFrame ({"School Name" :math_SchoolNames ,
"9th" :math_mean_grade_9th ,
"10th" :math_mean_grade_10th ,
"11th" :math_mean_grade_11th ,
"12th" :math_mean_grade_12th })
#Rearranging the columns
New_Math_DF = New_Math_DF [["School Name" ,"9th" ,"10th" ,"11th" ,"12th" ]]
#Setting the index
New_Math_DF .set_index ("School Name" ,inplace = True )
New_Math_DF
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
9th
10th
11th
12th
School Name
Bailey High School
77.08
77.00
77.52
76.49
Cabrera High School
83.09
83.15
82.77
83.28
Figueroa High School
76.40
76.54
76.88
77.15
Ford High School
77.36
77.67
76.92
76.18
Griffin High School
82.04
84.23
83.84
83.36
Hernandez High School
77.44
77.34
77.14
77.19
Holden High School
83.79
83.43
85.00
82.86
Huang High School
77.03
75.91
76.45
77.23
Johnson High School
77.19
76.69
77.49
76.86
Pena High School
83.63
83.37
84.33
84.12
Rodriguez High School
76.86
76.61
76.40
77.69
Shelton High School
83.42
82.92
83.38
83.78
Thomas High School
83.59
83.09
83.50
83.50
Wilson High School
83.09
83.72
83.20
83.04
Wright High School
83.26
84.01
83.84
83.64
Reading Scores based on Grades for each School
#Reading Scores by grade
Grouped_SchoolName_Grade = Student_DF .groupby ([Student_DF ["School Name" ],Student_DF ["grade" ]],as_index = False )
Reading_Score_Grade_DF = Grouped_SchoolName_Grade ["reading_score" ].mean ()
#Creating lists
reading_SchoolNames = []
reading_mean_grade_9th = []
reading_mean_grade_10th = []
reading_mean_grade_11th = []
reading_mean_grade_12th = []
for school in Reading_Score_Grade_DF ["School Name" ].unique ():
School_Name = school
reading_SchoolNames .append (School_Name )
index = reading_SchoolNames .index (school )
reading_mean_grade_9th .insert (index ,round (Reading_Score_Grade_DF .loc [(Reading_Score_Grade_DF ["School Name" ]== school ) & (Reading_Score_Grade_DF ["grade" ]== "9th" ),"reading_score" ].values [0 ],2 ))
reading_mean_grade_10th .insert (index ,round (Reading_Score_Grade_DF .loc [(Reading_Score_Grade_DF ["School Name" ]== school ) & (Reading_Score_Grade_DF ["grade" ]== "10th" ),"reading_score" ].values [0 ],2 ))
reading_mean_grade_11th .insert (index ,round (Reading_Score_Grade_DF .loc [(Reading_Score_Grade_DF ["School Name" ]== school ) & (Reading_Score_Grade_DF ["grade" ]== "11th" ),"reading_score" ].values [0 ],2 ))
reading_mean_grade_12th .insert (index ,round (Reading_Score_Grade_DF .loc [(Reading_Score_Grade_DF ["School Name" ]== school ) & (Reading_Score_Grade_DF ["grade" ]== "12th" ),"reading_score" ].values [0 ],2 ))
#Creating the dataframe of Math Scores based on the Grade
New_Reading_DF = pd .DataFrame ({"School Name" :reading_SchoolNames ,
"9th" :reading_mean_grade_9th ,
"10th" :reading_mean_grade_10th ,
"11th" :reading_mean_grade_11th ,
"12th" :reading_mean_grade_12th })
#Rearranging the columns
New_Reading_DF = New_Reading_DF [["School Name" ,"9th" ,"10th" ,"11th" ,"12th" ]]
#Setting the index
New_Reading_DF .set_index ("School Name" ,inplace = True )
New_Reading_DF
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
9th
10th
11th
12th
School Name
Bailey High School
81.30
80.91
80.95
80.91
Cabrera High School
83.68
84.25
83.79
84.29
Figueroa High School
81.20
81.41
80.64
81.38
Ford High School
80.63
81.26
80.40
80.66
Griffin High School
83.37
83.71
84.29
84.01
Hernandez High School
80.87
80.66
81.40
80.86
Holden High School
83.68
83.32
83.82
84.70
Huang High School
81.29
81.51
81.42
80.31
Johnson High School
81.26
80.77
80.62
81.23
Pena High School
83.81
83.61
84.34
84.59
Rodriguez High School
80.99
80.63
80.86
80.38
Shelton High School
84.12
83.44
84.37
82.78
Thomas High School
83.73
84.25
83.59
83.83
Wilson High School
83.94
84.02
83.76
84.32
Wright High School
83.83
83.81
84.16
84.07
Scores based on the School Spending for each student
#Scores by School Spending
bins = [0 ,584 ,614 ,644 ,675 ]
label_value = ["<$585" ,"$585-615" ,"$615-645" ,"$645-675" ]
New_School_Summary = School_Summary
New_School_Summary ["% Passing Maths" ]= round (New_School_Summary ["% Passing Maths" ].str .replace ("%" , '' ).astype (float ),2 )
New_School_Summary ["% Passing Reading" ]= round (New_School_Summary ["% Passing Reading" ].str .replace ("%" , '' ).astype (float ),2 )
New_School_Summary ["% Overall Passing Rate" ]= round (New_School_Summary ["% Overall Passing Rate" ].str .replace ("%" , '' ).astype (float ),2 )
#Cutting based on the bins
New_School_Summary ["Spending Ranges (Per Student)" ]= pd .cut (New_School_Summary ["Per Student Budget" ].str .replace ("$" , '' ).astype (float ),bins ,labels = label_value )
#Grouping based on Spending Ranges (Per Student)
Grouped_School_Summary_Spending = New_School_Summary .groupby ("Spending Ranges (Per Student)" ,as_index = False ).mean ()
#Extracting the required columns from the dataframe
Grouped_School_Summary_Spending = Grouped_School_Summary_Spending [["Spending Ranges (Per Student)" ,"Average Math Score" ,
"Average Reading Score" ,"% Passing Maths" ,
"% Passing Reading" ,"% Overall Passing Rate" ]]
#Setting the index
Grouped_School_Summary_Spending .set_index ("Spending Ranges (Per Student)" ,inplace = True )
Grouped_School_Summary_Spending
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
Average Math Score
Average Reading Score
% Passing Maths
% Passing Reading
% Overall Passing Rate
Spending Ranges (Per Student)
<$585
83.452500
83.935000
93.460000
96.610000
95.035000
$585-615
83.600000
83.885000
94.230000
95.900000
95.065000
$615-645
79.078333
81.891667
75.668333
86.106667
80.888333
$645-675
76.996667
81.026667
66.163333
81.133333
73.650000
Scores based on the School Size (i.e., Total Students in each school)
#Scores by School Size
bins = [0 ,999 ,1999 ,5000 ]
label_value = ["Small (<1000)" ,"Medium (1000-2000)" ,"Large (2000-5000)" ]
#Cutting based on the bins
New_School_Summary ["School Size" ]= pd .cut (New_School_Summary ["Total Students" ],bins ,labels = label_value )
#Grouping based on School Size
Grouped_School_Summary_Size = New_School_Summary .groupby ("School Size" ,as_index = False ).mean ()
#Setting the index
Grouped_School_Summary_Size .set_index ("School Size" ,inplace = True )
#Extracting the required columns from the dataframe
Grouped_School_Summary_Size = Grouped_School_Summary_Size [["Average Math Score" ,"Average Reading Score" ,
"% Passing Maths" ,"% Passing Reading" ,"% Overall Passing Rate" ]]
Grouped_School_Summary_Size
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
Average Math Score
Average Reading Score
% Passing Maths
% Passing Reading
% Overall Passing Rate
School Size
Small (<1000)
83.820
83.92500
93.55000
96.10000
94.8250
Medium (1000-2000)
83.374
83.86800
93.59800
96.79000
95.1920
Large (2000-5000)
77.745
81.34375
69.96375
82.76625
76.3675
Scores based on the School Type
#Scores by School Type
#Grouping based on School Type
Grouped_School_Summary_Type = New_School_Summary .groupby ("School Type" ,as_index = False ).mean ()
#Setting the index
Grouped_School_Summary_Type .set_index ("School Type" ,inplace = True )
#Extracting the required columns from the dataframe
Grouped_School_Summary_Type = Grouped_School_Summary_Type [["Average Math Score" ,"Average Reading Score" ,
"% Passing Maths" ,"% Passing Reading" ,"% Overall Passing Rate" ]]
Grouped_School_Summary_Type
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
Average Math Score
Average Reading Score
% Passing Maths
% Passing Reading
% Overall Passing Rate
School Type
Charter
83.472500
83.897500
93.620000
96.586250
95.102500
District
76.955714
80.965714
66.548571
80.798571
73.675714