
artificial-intelligence's Issues

Data Science Analytics

# Import the standard Python scientific libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Import Plotly and use it in offline mode
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.tools as tls
import plotly.graph_objs as go
import plotly.figure_factory as fig_fact
plotly.tools.set_config_file(world_readable=True, sharing='public')

# Suppress deprecation and incorrect-usage warnings
import warnings
warnings.filterwarnings('ignore')

# Load the multiple choice responses into a Pandas DataFrame
mcq = pd.read_csv('multipleChoiceResponses.csv', encoding="ISO-8859-1", low_memory=False)
mcq.shape

# Load the free-form responses into a Pandas DataFrame
ff = pd.read_csv('freeformResponses.csv', encoding="ISO-8859-1", low_memory=False)
ff.shape

# Plot respondent counts by gender
sns.countplot(y='GenderSelect', data=mcq)

# Create a DataFrame of respondent counts by country
con_df = pd.DataFrame(mcq['Country'].value_counts())
con_df['country'] = con_df.index
con_df.columns = ['num_resp', 'country']
con_df = con_df.reset_index().drop('index', axis=1)
con_df.head(10)

# Create a choropleth map of the respondents using Plotly.
# Find out more at https://plot.ly/python/choropleth-maps/
data = [dict(
    type = 'choropleth',
    locations = con_df['country'],
    locationmode = 'country names',
    z = con_df['num_resp'],
    text = con_df['country'],
    colorscale = [[0, 'rgb(255, 255, 255)'], [1, 'rgb(56, 142, 60)']],
    autocolorscale = False,
    reversescale = False,
    marker = dict(
        line = dict(
            color = 'rgb(180,180,180)',
            width = 0.5
        )),
    colorbar = dict(
        autotick = False,
        tickprefix = '',
        title = 'Survey Respondents'),
)]

layout = dict(
    title = 'Survey Respondents by Nationality',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        projection = dict(
            type = 'Mercator'
        )
    )
)

fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='survey-world-map')

# Get summary statistics of the respondents' ages
mcq['Age'].describe()

# Plot the age distribution
fig = fig_fact.create_distplot([mcq[mcq['Age'] > 0]['Age']], ['age'], colors=['#BA68C8'])
py.iplot(fig, filename='Basic Distplot')
#sns.distplot(mcq[mcq['Age'] > 0]['Age'])

# Plot respondent counts by formal education
sns.countplot(y='FormalEducation', data=mcq)

# Plot respondent counts by undergraduate major
plt.figure(figsize=(6, 8))
sns.countplot(y='MajorSelect', data=mcq)

# Plot respondent counts by employment status
sns.countplot(y='EmploymentStatus', data=mcq)

# Plot respondent counts by tenure
sns.countplot(y='Tenure', data=mcq)

# Plot respondent counts by recommended language
sns.countplot(y='LanguageRecommendationSelect', data=mcq)

top_lang = mcq['LanguageRecommendationSelect'].value_counts()
top_lang_dist = []
for lang in top_lang.index:
    top_lang_dist.append(mcq[(mcq['Age'].notnull()) & (mcq['LanguageRecommendationSelect'] == lang)]['Age'])

group_labels = top_lang.index

fig = fig_fact.create_distplot(top_lang_dist, group_labels, show_hist=False)
py.iplot(fig, filename='Language Preferences by Age')

mcq[mcq['CurrentJobTitleSelect'].notnull()]['CurrentJobTitleSelect'].shape

# Plot the number of R and Python users by occupation
data = mcq[(mcq['CurrentJobTitleSelect'].notnull()) & ((mcq['LanguageRecommendationSelect'] == 'Python') | (mcq['LanguageRecommendationSelect'] == 'R'))]
plt.figure(figsize=(8, 10))
sns.countplot(y="CurrentJobTitleSelect", hue="LanguageRecommendationSelect", data=data)

# Render a bar plot of the 15 most popular ML tools for next year
data = mcq['MLToolNextYearSelect'].value_counts().head(15)
sns.barplot(y=data.index, x=data)

# Render a bar plot of the 15 most popular ML methods for next year
data = mcq['MLMethodNextYearSelect'].value_counts().head(15)
sns.barplot(y=data.index, x=data)

# Explode the DataFrame to count how many times each learning platform was mentioned
mcq['LearningPlatformSelect'] = mcq['LearningPlatformSelect'].astype('str').apply(lambda x: x.split(','))
s = mcq.apply(lambda x: pd.Series(x['LearningPlatformSelect']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'platform'

plt.figure(figsize=(6,8))
data = s[s != 'nan'].value_counts()
sns.barplot(y=data.index, x=data)

use_features = [x for x in mcq.columns if x.find('LearningPlatformUsefulness') != -1]

# Build a DataFrame showing how useful respondents found each learning platform
fdf = {}
for feature in use_features:
    a = mcq[feature].value_counts()
    a = a / a.sum()
    fdf[feature[len('LearningPlatformUsefulness'):]] = a

fdf = pd.DataFrame(fdf).transpose()

# Plot a heatmap of learning platform usefulness
plt.figure(figsize=(6,12))
sns.heatmap(fdf.sort_values("Very useful", ascending=False), annot=True)

# Plot a grouped bar plot of learning platform usefulness
fdf.plot(kind='bar', figsize=(18, 8), title="Usefulness of Learning Platforms")
plt.show()

cat_features = [x for x in mcq.columns if x.find('LearningCategory') != -1]

cdf = {}
for feature in cat_features:
    cdf[feature[len('LearningCategory'):]] = mcq[feature].mean()

cdf = pd.Series(cdf)

# Plot a pie chart of each platform's contribution to learning
plt.pie(cdf, labels=cdf.index,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
plt.title("Contribution of each Platform to Learning")
plt.show()

mcq[mcq['HardwarePersonalProjectsSelect'].notnull()]['HardwarePersonalProjectsSelect'].shape

# Explode the comma-separated hardware selections
mcq['HardwarePersonalProjectsSelect'] = mcq['HardwarePersonalProjectsSelect'].astype('str').apply(lambda x: x.split(','))
s = mcq.apply(lambda x: pd.Series(x['HardwarePersonalProjectsSelect']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'hardware'

s[s != 'nan'].value_counts()

# Plot study time by employment status
plt.figure(figsize=(8, 8))
sns.countplot(y='TimeSpentStudying', data=mcq, hue='EmploymentStatus').legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Explode the comma-separated blog/podcast/newsletter selections
mcq['BlogsPodcastsNewslettersSelect'] = mcq['BlogsPodcastsNewslettersSelect'].astype('str').apply(lambda x: x.split(','))

s = mcq.apply(lambda x: pd.Series(x['BlogsPodcastsNewslettersSelect']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'platforms'

s = s[s != 'nan'].value_counts()

plt.figure(figsize=(6,8))
plt.title("Most Popular Blogs and Podcasts")
sns.barplot(y=s.index, x=s)

# Explode the comma-separated course platform selections
mcq['CoursePlatformSelect'] = mcq['CoursePlatformSelect'].astype('str').apply(lambda x: x.split(','))

t = mcq.apply(lambda x: pd.Series(x['CoursePlatformSelect']),axis=1).stack().reset_index(level=1, drop=True)
t.name = 'courses'

t = t[t != 'nan'].value_counts()

plt.title("Most Popular Course Platforms")
sns.barplot(y=t.index, x=t)

job_features = [x for x in mcq.columns if x.find('JobSkillImportance') != -1 and x.find('JobSkillImportanceOther') == -1]

# Build a DataFrame of skill importance for data science jobs
jdf = {}
for feature in job_features:
    a = mcq[feature].value_counts()
    a = a / a.sum()
    jdf[feature[len('JobSkillImportance'):]] = a

jdf = pd.DataFrame(jdf).transpose()

jdf.plot(kind='bar', figsize=(12,6), title="Skill Importance in Data Science Jobs")

mcq[mcq['CompensationAmount'].notnull()].shape

# Convert all salary values to float; if not possible, convert them to NaN
def clean_salary(x):
    x = x.replace(',', '')
    try:
        return float(x)
    except:
        return np.nan

# Print salary statistics and plot the salary distribution for a given country
def salary_stats(country):
    data = mcq[(mcq['CompensationAmount'].notnull()) & (mcq['Country'] == country)].copy()
    data['CompensationAmount'] = data['CompensationAmount'].apply(clean_salary)
    print(data[data['CompensationAmount'] < 1e9]['CompensationAmount'].describe())
    sns.distplot(data[data['CompensationAmount'] < 1e9]['CompensationAmount'])

salary_stats('India')

salary_stats('United States')

# Explode the comma-separated public dataset selections
mcq['PublicDatasetsSelect'] = mcq['PublicDatasetsSelect'].astype('str').apply(lambda x: x.split(','))

q = mcq.apply(lambda x: pd.Series(x['PublicDatasetsSelect']), axis=1).stack().reset_index(level=1, drop=True)
q.name = 'datasets'

q = q[q != 'nan'].value_counts()

plt.title("Most Popular Dataset Platforms")
sns.barplot(y=q.index, x=q)

ff['PersonalProjectsChallengeFreeForm'].value_counts().head(15)

Implementation inaccuracies in the Local Search exercise

I suspect inaccuracies in the algorithm implementations in the Local Search exercise solutions.

  1. Beam search always checks the utility of each node against the utility of `problem`, which is never updated:
beam = [problem]
for t in range(self.epochs):
    neighborhood = chain(*(n.successors() for n in beam))  # union of all neighbors
    beam = sorted(neighborhood, key=lambda x: x.utility)[-self.beam_width:]
    # !problem is never updated!
    if all([node.utility < problem.utility for node in beam]): break
return beam[-1]

Shouldn't we compare the beam utilities with the best solution from a previous step? (A corrected sketch of both loops follows the second point.)

  2. Late Acceptance Hill Climbing updates `f` to the utility of the candidate regardless of whether it was accepted or rejected:
f = [problem.utility] * self.Lfa
for i in range(self.epochs):
    _problem = max(problem.successors(), key=lambda x: x.utility)
    v = i % self.Lfa
    if _problem.utility >= f[v] or _problem.utility >= problem.utility: problem = _problem
    f[v] = _problem.utility # Shouldn't it be problem.utility?
return problem

As far as I understand the pseudocode, `f[v]` should be set to `problem.utility` (not `_problem.utility`!), so that it stays the same when the candidate is rejected.
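For illustration, here is one possible correction of both loops as a hedged sketch. It assumes the same `successors()`/`utility` node interface and the `self.epochs`, `self.beam_width`, and `self.Lfa` attributes from the quoted solutions; it is not the official fix.

from itertools import chain

def beam_search(self, problem):
    # Keep an incumbent best node; the quoted code compared against
    # `problem`, whose utility was never updated.
    beam, best = [problem], problem
    for t in range(self.epochs):
        neighborhood = chain(*(n.successors() for n in beam))  # union of all neighbors
        beam = sorted(neighborhood, key=lambda x: x.utility)[-self.beam_width:]
        if all(node.utility < best.utility for node in beam):
            break
        best = beam[-1]  # incumbent for the next iteration's comparison
    return best

def late_acceptance_hill_climbing(self, problem):
    f = [problem.utility] * self.Lfa
    for i in range(self.epochs):
        candidate = max(problem.successors(), key=lambda x: x.utility)
        v = i % self.Lfa
        if candidate.utility >= f[v] or candidate.utility >= problem.utility:
            problem = candidate  # accept the candidate
        f[v] = problem.utility   # record the ACCEPTED utility; a rejected
                                 # candidate leaves the history unchanged
    return problem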

Planning Project Layers Inclusion Testing

The layers implementation uses dictionaries for the parents/children attributes, so membership testing on those attributes checks the keys of the layer rather than the values. E.g., `foo in literal_layer.children` iterates over the literals in the current layer, not over their children.

It may be better to make `children` and `parents` private and expose a read-only getter that requires a key, making the API clearer.
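A minimal, self-contained illustration of the pitfall (the literal names here are hypothetical):

# `children` maps each literal in the current layer to its child literals
children = {'A': {'B', 'C'}, 'D': {'E'}}

print('A' in children)  # True  -- `in` tests the keys, i.e. this layer's literals
print('B' in children)  # False -- B is a child, but not a key
print(any('B' in kids for kids in children.values()))  # True -- the values must be checked explicitly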

Broken link

There is a broken link here:

1. Follow the instructions in the classroom lesson to install and configure the `aind` [Anaconda](https://www.continuum.io/downloads) environment which includes several important packages that are used for the project. OS X or Unix/Linux users can activate the aind environment by running the following (Windows users simply run `activate aind`):

Jupyter kernel error in AIND-Constraint_Satisfaction.ipynb

Having created the Miniconda environment from `aind-universal-v3.yml`, I get the following error when trying to open the AIND-Constraint_Satisfaction.ipynb notebook:

jupyter kernel ImportError: No module named 'win32api'

It looks like the environment file is missing the pywin32 package; installing it into the environment (e.g. `pip install pywin32`) should resolve the import error.

Project 2 h_maxlevel test case cannot be solved

When testing the h_maxlevel function, two conditions must be satisfied:

  1. the goal is a subset of the literal layer, and
  2. no two of the goal literals are mutex.

But in fact, some test cases cannot satisfy both conditions, even when the graph is leveled.
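For orientation, here is a hedged sketch of the textbook max-level heuristic that the test exercises; the `graph`/`goal` parameters and the `literal_layers` attribute are assumptions for illustration, not the project's exact API:

def h_maxlevel(graph, goal):
    # Max-level heuristic: the largest of the levels at which each goal
    # literal first appears in the planning graph.
    costs = []
    for g in goal:
        for level, layer in enumerate(graph.literal_layers):
            if g in layer:
                costs.append(level)
                break
        else:
            return float('inf')  # this goal literal never appears: unreachable
    return max(costs)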

Artificial Neural Network Churn Modelling

Importing the dataset

dataset = read.csv('Churn_Modelling.csv')
dataset = dataset[4:14]

Encoding the categorical variables as factors

dataset$Geography = as.numeric(factor(dataset$Geography,
                                      levels = c('France', 'Spain', 'Germany'),
                                      labels = c(1, 2, 3)))
dataset$Gender = as.numeric(factor(dataset$Gender,
                                   levels = c('Female', 'Male'),
                                   labels = c(1, 2)))

Splitting the dataset into the Training set and Test set

install.packages('caTools')

library(caTools)
set.seed(123)
split = sample.split(dataset$Exited, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

Feature Scaling

training_set[-11] = scale(training_set[-11])
test_set[-11] = scale(test_set[-11])

Fitting ANN to the Training set

install.packages('h2o')

library(h2o)
h2o.init(nthreads = -1)
model = h2o.deeplearning(y = 'Exited',
                         training_frame = as.h2o(training_set),
                         activation = 'Rectifier',
                         hidden = c(5, 5),
                         epochs = 100,
                         train_samples_per_iteration = -2)

Predicting the Test set results

y_pred = h2o.predict(model, newdata = as.h2o(test_set[-11]))
y_pred = (y_pred > 0.5)
y_pred = as.vector(y_pred)

Making the Confusion Matrix

cm = table(test_set[, 11], y_pred)

h2o.shutdown()

neural network capim

setwd("C:/Users/Jose/Desktop")
data <- read.csv("dataCS.txt", header = TRUE, sep = "\t")

# 75/25 train/test split
index <- sample(1:nrow(data), round(0.75 * nrow(data)))
train <- data[index,]
test <- data[-index,]

# Linear model baseline and its test-set MSE
lm.fit <- glm(Irri ~ ., data = train)
summary(lm.fit)
pr.lm <- predict(lm.fit, test)
MSE.lm <- sum((pr.lm - test$Irri)^2) / nrow(test)

# Min-max scale all columns to [0, 1]
maxs <- apply(data, 2, max)
mins <- apply(data, 2, min)

scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins))

train_ <- scaled[index,]
test_ <- scaled[-index,]

# Fit a neural network with two hidden layers (5 and 3 neurons)
library(neuralnet)
n <- names(train_)
f <- as.formula(paste("Irri ~", paste(n[!n %in% "Irri"], collapse = " + ")))
nn <- neuralnet(f, data = train_, hidden = c(5, 3), linear.output = F)

plot(nn)

# Predict on the test set and rescale back to the original units
pr.nn <- compute(nn, test_[,1:4])

pr.nn_ <- pr.nn$net.result * (max(data$Irri) - min(data$Irri)) + min(data$Irri)
test.r <- (test_$Irri) * (max(data$Irri) - min(data$Irri)) + min(data$Irri)

MSE.nn <- sum((test.r - pr.nn_)^2) / nrow(test_)

print(paste(MSE.lm,MSE.nn))

par(mfrow=c(1,2))

# Comparison: NN predictions vs. actual
plot(test$Irri,pr.nn_,col='red',main='Real vs predicted NN',pch=18,cex=0.7)
abline(0,1,lwd=2)
legend('bottomright',legend='NN',pch=18,col='red', bty='n')

# Comparison: linear model predictions vs. actual
plot(test$Irri,pr.lm,col='blue',main='Real vs predicted linear model',pch=18, cex=0.7)
abline(0,1,lwd=2)
legend('bottomright',legend='linear model',pch=18,col='blue', bty='n', cex=.95)

# Combined comparison
plot(test$Irri,pr.nn_,col='red',main='Real vs predicted NN',pch=18,cex=0.7)
points(test$Irri,pr.lm,col='blue',pch=18,cex=0.7)
abline(0,1,lwd=2)
legend('bottomright',legend=c('NN','LM'),pch=18,col=c('red','blue'))


# 10-fold cross-validation for the linear model
library(boot)
set.seed(200)
lm.fit <- glm(Irri ~ ., data = data)
cv.glm(data, lm.fit, K = 10)$delta[1]

# Cross-validate the neural network: k random 90/10 splits
set.seed(450)
cv.error <- NULL
k <- 10

library(plyr)
pbar <- create_progress_bar('text')
pbar$init(k)

for(i in 1:k){
    index <- sample(1:nrow(data), round(0.9 * nrow(data)))
    train.cv <- scaled[index,]
    test.cv <- scaled[-index,]

    nn <- neuralnet(f, data = train.cv, hidden = c(5, 3), linear.output = F)

    pr.nn <- compute(nn, test.cv[,1:4])
    pr.nn <- pr.nn$net.result * (max(data$Irri) - min(data$Irri)) + min(data$Irri)

    test.cv.r <- (test.cv$Irri) * (max(data$Irri) - min(data$Irri)) + min(data$Irri)

    cv.error[i] <- sum((test.cv.r - pr.nn)^2) / nrow(test.cv)

    pbar$step()
}

mean(cv.error)

boxplot(cv.error, xlab = 'MSE CV', col = 'cyan',
        border = 'blue', names = 'CV error (MSE)',
        main = 'CV error (MSE) for NN', horizontal = TRUE)


# Repeat the cross-validation with a smaller network (5 and 2 neurons) and linear output
library(plyr)
pbar <- create_progress_bar('text')
pbar$init(k)

for(i in 1:k){
    index <- sample(1:nrow(data), round(0.9 * nrow(data)))
    train.cv <- scaled[index,]
    test.cv <- scaled[-index,]

    nn <- neuralnet(f, data = train.cv, hidden = c(5, 2), linear.output = T)

    pr.nn <- compute(nn, test.cv[,1:4])
    pr.nn <- pr.nn$net.result * (max(data$Irri) - min(data$Irri)) + min(data$Irri)

    test.cv.r <- (test.cv$Irri) * (max(data$Irri) - min(data$Irri)) + min(data$Irri)

    cv.error[i] <- sum((test.cv.r - pr.nn)^2) / nrow(test.cv)

    pbar$step()
}

mean(cv.error)

boxplot(cv.error, xlab = 'MSE CV', col = 'cyan',
        border = 'blue', names = 'CV error (MSE)',
        main = 'CV error (MSE) for NN', horizontal = TRUE)
