
artificial-intelligence's Issues

Data Science Analytics

# Import the standard Python scientific libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Import Plotly and use it in offline mode
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.tools as tls
import plotly.graph_objs as go
import plotly.figure_factory as fig_fact
plotly.tools.set_config_file(world_readable=True, sharing='public')

# Suppress deprecation and incorrect-usage warnings
import warnings
warnings.filterwarnings('ignore')

# Load the multiple choice responses into a Pandas DataFrame
mcq = pd.read_csv('multipleChoiceResponses.csv', encoding="ISO-8859-1", low_memory=False)
mcq.shape

# Load the free-form responses into a Pandas DataFrame
ff = pd.read_csv('freeformResponses.csv', encoding="ISO-8859-1", low_memory=False)
ff.shape

# Plot respondent counts by gender
sns.countplot(y='GenderSelect', data=mcq)

# Create a DataFrame of respondent counts by country
con_df = pd.DataFrame(mcq['Country'].value_counts())
con_df['country'] = con_df.index
con_df.columns = ['num_resp', 'country']
con_df = con_df.reset_index().drop('index', axis=1)
con_df.head(10)

# Create a choropleth map of the respondents using Plotly.
# Find out more at https://plot.ly/python/choropleth-maps/
data = [dict(
    type = 'choropleth',
    locations = con_df['country'],
    locationmode = 'country names',
    z = con_df['num_resp'],
    text = con_df['country'],
    colorscale = [[0, 'rgb(255, 255, 255)'], [1, 'rgb(56, 142, 60)']],
    autocolorscale = False,
    reversescale = False,
    marker = dict(
        line = dict(
            color = 'rgb(180,180,180)',
            width = 0.5
        )),
    colorbar = dict(
        autotick = False,
        tickprefix = '',
        title = 'Survey Respondents'),
)]

layout = dict(
    title = 'Survey Respondents by Nationality',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        projection = dict(
            type = 'Mercator'
        )
    )
)

fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='survey-world-map')

# Get summary statistics of the respondents' ages
mcq['Age'].describe()

# Plot the age distribution
fig = fig_fact.create_distplot([mcq[mcq['Age'] > 0]['Age']], ['age'], colors=['#BA68C8'])
py.iplot(fig, filename='Basic Distplot')
#sns.distplot(mcq[mcq['Age'] > 0]['Age'])

# Plot respondent counts by formal education
sns.countplot(y='FormalEducation', data=mcq)

# Plot respondent counts by undergraduate major
plt.figure(figsize=(6, 8))
sns.countplot(y='MajorSelect', data=mcq)

# Plot respondent counts by employment status
sns.countplot(y='EmploymentStatus', data=mcq)

# Plot respondent counts by tenure
sns.countplot(y='Tenure', data=mcq)

# Plot respondent counts by recommended language
sns.countplot(y='LanguageRecommendationSelect', data=mcq)

top_lang = mcq['LanguageRecommendationSelect'].value_counts()
top_lang_dist = []
for lang in top_lang.index:
    top_lang_dist.append(mcq[(mcq['Age'].notnull()) & (mcq['LanguageRecommendationSelect'] == lang)]['Age'])

group_labels = top_lang.index

fig = fig_fact.create_distplot(top_lang_dist, group_labels, show_hist=False)
py.iplot(fig, filename='Language Preferences by Age')

mcq[mcq['CurrentJobTitleSelect'].notnull()]['CurrentJobTitleSelect'].shape

# Plot the number of R and Python users by occupation
data = mcq[(mcq['CurrentJobTitleSelect'].notnull()) & ((mcq['LanguageRecommendationSelect'] == 'Python') | (mcq['LanguageRecommendationSelect'] == 'R'))]
plt.figure(figsize=(8, 10))
sns.countplot(y="CurrentJobTitleSelect", hue="LanguageRecommendationSelect", data=data)

# Render a bar plot of the 15 most popular ML tools for next year
data = mcq['MLToolNextYearSelect'].value_counts().head(15)
sns.barplot(y=data.index, x=data)

# Render a bar plot of the 15 most popular ML methods for next year
data = mcq['MLMethodNextYearSelect'].value_counts().head(15)
sns.barplot(y=data.index, x=data)

# Explode the DataFrame to count how many times each learning platform was mentioned
mcq['LearningPlatformSelect'] = mcq['LearningPlatformSelect'].astype('str').apply(lambda x: x.split(','))
s = mcq.apply(lambda x: pd.Series(x['LearningPlatformSelect']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'platform'

plt.figure(figsize=(6,8))
data = s[s != 'nan'].value_counts()
sns.barplot(y=data.index, x=data)

use_features = [x for x in mcq.columns if x.find('LearningPlatformUsefulness') != -1]

# Build a DataFrame showing how useful respondents found each learning platform
fdf = {}
for feature in use_features:
    a = mcq[feature].value_counts()
    a = a / a.sum()
    fdf[feature[len('LearningPlatformUsefulness'):]] = a

fdf = pd.DataFrame(fdf).transpose()

# Plot a heatmap of learning platform usefulness
plt.figure(figsize=(6,12))
sns.heatmap(fdf.sort_values("Very useful", ascending=False), annot=True)

# Plot a grouped bar plot of learning platform usefulness
fdf.plot(kind='bar', figsize=(18, 8), title="Usefulness of Learning Platforms")
plt.show()

cat_features = [x for x in mcq.columns if x.find('LearningCategory') != -1]

cdf = {}
for feature in cat_features:
    cdf[feature[len('LearningCategory'):]] = mcq[feature].mean()

cdf = pd.Series(cdf)

# Plot a pie chart of each platform's contribution to learning
plt.pie(cdf, labels=cdf.index,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
plt.title("Contribution of each Platform to Learning")
plt.show()

mcq[mcq['HardwarePersonalProjectsSelect'].notnull()]['HardwarePersonalProjectsSelect'].shape

# Explode the comma-separated hardware selections
mcq['HardwarePersonalProjectsSelect'] = mcq['HardwarePersonalProjectsSelect'].astype('str').apply(lambda x: x.split(','))
s = mcq.apply(lambda x: pd.Series(x['HardwarePersonalProjectsSelect']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'hardware'

s[s != 'nan'].value_counts()

# Plot study time by employment status
plt.figure(figsize=(8, 8))
sns.countplot(y='TimeSpentStudying', data=mcq, hue='EmploymentStatus').legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Explode the comma-separated blog/podcast/newsletter selections
mcq['BlogsPodcastsNewslettersSelect'] = mcq['BlogsPodcastsNewslettersSelect'].astype('str').apply(lambda x: x.split(','))

s = mcq.apply(lambda x: pd.Series(x['BlogsPodcastsNewslettersSelect']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'platforms'

s = s[s != 'nan'].value_counts()

plt.figure(figsize=(6,8))
plt.title("Most Popular Blogs and Podcasts")
sns.barplot(y=s.index, x=s)

# Explode the comma-separated course platform selections
mcq['CoursePlatformSelect'] = mcq['CoursePlatformSelect'].astype('str').apply(lambda x: x.split(','))

t = mcq.apply(lambda x: pd.Series(x['CoursePlatformSelect']),axis=1).stack().reset_index(level=1, drop=True)
t.name = 'courses'

t = t[t != 'nan'].value_counts()

plt.title("Most Popular Course Platforms")
sns.barplot(y=t.index, x=t)

job_features = [x for x in mcq.columns if x.find('JobSkillImportance') != -1 and x.find('JobSkillImportanceOther') == -1]

# Build a DataFrame of skill importance for data science jobs
jdf = {}
for feature in job_features:
    a = mcq[feature].value_counts()
    a = a / a.sum()
    jdf[feature[len('JobSkillImportance'):]] = a

jdf = pd.DataFrame(jdf).transpose()

jdf.plot(kind='bar', figsize=(12,6), title="Skill Importance in Data Science Jobs")

mcq[mcq['CompensationAmount'].notnull()].shape

# Convert all salary values to float; if not possible, convert them to NaN
def clean_salary(x):
    x = x.replace(',', '')
    try:
        return float(x)
    except:
        return np.nan

# Print salary statistics and plot the salary distribution for a given country
def salary_stats(country):
    data = mcq[(mcq['CompensationAmount'].notnull()) & (mcq['Country'] == country)].copy()
    data['CompensationAmount'] = data['CompensationAmount'].apply(clean_salary)
    print(data[data['CompensationAmount'] < 1e9]['CompensationAmount'].describe())
    sns.distplot(data[data['CompensationAmount'] < 1e9]['CompensationAmount'])

salary_stats('India')

salary_stats('United States')

# Explode the comma-separated public dataset selections
mcq['PublicDatasetsSelect'] = mcq['PublicDatasetsSelect'].astype('str').apply(lambda x: x.split(','))

q = mcq.apply(lambda x: pd.Series(x['PublicDatasetsSelect']), axis=1).stack().reset_index(level=1, drop=True)
q.name = 'datasets'

q = q[q != 'nan'].value_counts()

plt.title("Most Popular Dataset Platforms")
sns.barplot(y=q.index, x=q)

ff['PersonalProjectsChallengeFreeForm'].value_counts().head(15)

Implementation inaccuracies in the Local Search exercise

I suspect inaccuracies in the algorithm implementations in the Local Search exercise solutions.

  1. Beam search always checks the utility of each node against the utility of `problem`, which is never updated:
beam = [problem]
for t in range(self.epochs):
    neighborhood = chain(*(n.successors() for n in beam))  # union of all neighbors
    beam = sorted(neighborhood, key=lambda x: x.utility)[-self.beam_width:]
    # !problem is never updated!
    if all([node.utility < problem.utility for node in beam]): break
return beam[-1]

Shouldn't we compare the beam utilities with the best solution from a previous step? (A corrected sketch of both loops follows the second point.)

  2. Late Acceptance Hill Climbing updates `f` to the utility of the candidate regardless of whether it was accepted or rejected:
f = [problem.utility] * self.Lfa
for i in range(self.epochs):
    _problem = max(problem.successors(), key=lambda x: x.utility)
    v = i % self.Lfa
    if _problem.utility >= f[v] or _problem.utility >= problem.utility: problem = _problem
    f[v] = _problem.utility # Shouldn't it be problem.utility?
return problem

As far as I understand the pseudocode, `f[v]` should be set to `problem.utility` (not `_problem.utility`!), so that it stays the same when the candidate is rejected.
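For illustration, here is one possible correction of both loops as a hedged sketch. It assumes the same `successors()`/`utility` node interface and the `self.epochs`, `self.beam_width`, and `self.Lfa` attributes from the quoted solutions; it is not the official fix.

from itertools import chain

def beam_search(self, problem):
    # Keep an incumbent best node; the quoted code compared against
    # `problem`, whose utility was never updated.
    beam, best = [problem], problem
    for t in range(self.epochs):
        neighborhood = chain(*(n.successors() for n in beam))  # union of all neighbors
        beam = sorted(neighborhood, key=lambda x: x.utility)[-self.beam_width:]
        if all(node.utility < best.utility for node in beam):
            break
        best = beam[-1]  # incumbent for the next iteration's comparison
    return best

def late_acceptance_hill_climbing(self, problem):
    f = [problem.utility] * self.Lfa
    for i in range(self.epochs):
        candidate = max(problem.successors(), key=lambda x: x.utility)
        v = i % self.Lfa
        if candidate.utility >= f[v] or candidate.utility >= problem.utility:
            problem = candidate  # accept the candidate
        f[v] = problem.utility   # record the ACCEPTED utility; a rejected
                                 # candidate leaves the history unchanged
    return problem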

Planning Project Layers Inclusion Testing

The layers implementation uses dictionaries for the parents/children attributes, so membership testing on those attributes checks the keys of the layer rather than the values. E.g., `foo in literal_layer.children` iterates over the literals in the current layer, not over their children.

It may be better to make `children` and `parents` private and expose a read-only getter that requires a key, making the API clearer.
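A minimal, self-contained illustration of the pitfall (the literal names here are hypothetical):

# `children` maps each literal in the current layer to its child literals
children = {'A': {'B', 'C'}, 'D': {'E'}}

print('A' in children)  # True  -- `in` tests the keys, i.e. this layer's literals
print('B' in children)  # False -- B is a child, but not a key
print(any('B' in kids for kids in children.values()))  # True -- the values must be checked explicitly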

Broken link

There is a broken link here:

1. Follow the instructions in the classroom lesson to install and configure the `aind` [Anaconda](https://www.continuum.io/downloads) environment which includes several important packages that are used for the project. OS X or Unix/Linux users can activate the aind environment by running the following (Windows users simply run `activate aind`):

Jupyter kernel error in AIND-Constraint_Satisfaction.ipynb

Having created the Miniconda environment from `aind-universal-v3.yml`, I get the following error when trying to open the AIND-Constraint_Satisfaction.ipynb notebook:

jupyter kernel ImportError: No module named 'win32api'

It looks like the environment file is missing the pywin32 package; installing it into the environment (e.g. `pip install pywin32`) should resolve the import error.

Project 2 h_maxlevel test case cannot be solved

When testing the h_maxlevel function, two conditions must be satisfied:

  1. the goal is a subset of the literal layer, and
  2. no two of the goal literals are mutex.

But in fact, some test cases cannot satisfy both conditions, even when the graph is leveled.
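For orientation, here is a hedged sketch of the textbook max-level heuristic that the test exercises; the `graph`/`goal` parameters and the `literal_layers` attribute are assumptions for illustration, not the project's exact API:

def h_maxlevel(graph, goal):
    # Max-level heuristic: the largest of the levels at which each goal
    # literal first appears in the planning graph.
    costs = []
    for g in goal:
        for level, layer in enumerate(graph.literal_layers):
            if g in layer:
                costs.append(level)
                break
        else:
            return float('inf')  # this goal literal never appears: unreachable
    return max(costs)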

Artificial Neural Network Churn Modelling

Importing the dataset

dataset = read.csv('Churn_Modelling.csv')
dataset = dataset[4:14]

Encoding the categorical variables as factors

dataset$Geography = as.numeric(factor(dataset$Geography,
                                      levels = c('France', 'Spain', 'Germany'),
                                      labels = c(1, 2, 3)))
dataset$Gender = as.numeric(factor(dataset$Gender,
                                   levels = c('Female', 'Male'),
                                   labels = c(1, 2)))

Splitting the dataset into the Training set and Test set

install.packages('caTools')

library(caTools)
set.seed(123)
split = sample.split(dataset$Exited, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

Feature Scaling

training_set[-11] = scale(training_set[-11])
test_set[-11] = scale(test_set[-11])

Fitting ANN to the Training set

install.packages('h2o')

library(h2o)
h2o.init(nthreads = -1)
model = h2o.deeplearning(y = 'Exited',
                         training_frame = as.h2o(training_set),
                         activation = 'Rectifier',
                         hidden = c(5, 5),
                         epochs = 100,
                         train_samples_per_iteration = -2)

Predicting the Test set results

y_pred = h2o.predict(model, newdata = as.h2o(test_set[-11]))
y_pred = (y_pred > 0.5)
y_pred = as.vector(y_pred)

Making the Confusion Matrix

cm = table(test_set[, 11], y_pred)

h2o.shutdown()

neural network capim

setwd("C:/Users/Jose/Desktop")
data <- read.csv("dataCS.txt", header = TRUE, sep = "\t")

# 75/25 train/test split
index <- sample(1:nrow(data), round(0.75 * nrow(data)))
train <- data[index,]
test <- data[-index,]

# Linear model baseline and its test-set MSE
lm.fit <- glm(Irri ~ ., data = train)
summary(lm.fit)
pr.lm <- predict(lm.fit, test)
MSE.lm <- sum((pr.lm - test$Irri)^2) / nrow(test)

# Min-max scale all columns to [0, 1]
maxs <- apply(data, 2, max)
mins <- apply(data, 2, min)

scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins))

train_ <- scaled[index,]
test_ <- scaled[-index,]

# Fit a neural network with two hidden layers (5 and 3 neurons)
library(neuralnet)
n <- names(train_)
f <- as.formula(paste("Irri ~", paste(n[!n %in% "Irri"], collapse = " + ")))
nn <- neuralnet(f, data = train_, hidden = c(5, 3), linear.output = F)

plot(nn)

# Predict on the test set and rescale back to the original units
pr.nn <- compute(nn, test_[,1:4])

pr.nn_ <- pr.nn$net.result * (max(data$Irri) - min(data$Irri)) + min(data$Irri)
test.r <- (test_$Irri) * (max(data$Irri) - min(data$Irri)) + min(data$Irri)

MSE.nn <- sum((test.r - pr.nn_)^2) / nrow(test_)

print(paste(MSE.lm,MSE.nn))

par(mfrow=c(1,2))

# Comparison: NN predictions vs. actual
plot(test$Irri,pr.nn_,col='red',main='Real vs predicted NN',pch=18,cex=0.7)
abline(0,1,lwd=2)
legend('bottomright',legend='NN',pch=18,col='red', bty='n')

# Comparison: linear model predictions vs. actual
plot(test$Irri,pr.lm,col='blue',main='Real vs predicted linear model',pch=18, cex=0.7)
abline(0,1,lwd=2)
legend('bottomright',legend='linear model',pch=18,col='blue', bty='n', cex=.95)

# Combined comparison
plot(test$Irri,pr.nn_,col='red',main='Real vs predicted NN',pch=18,cex=0.7)
points(test$Irri,pr.lm,col='blue',pch=18,cex=0.7)
abline(0,1,lwd=2)
legend('bottomright',legend=c('NN','LM'),pch=18,col=c('red','blue'))


# 10-fold cross-validation for the linear model
library(boot)
set.seed(200)
lm.fit <- glm(Irri ~ ., data = data)
cv.glm(data, lm.fit, K = 10)$delta[1]

# Cross-validate the neural network: k random 90/10 splits
set.seed(450)
cv.error <- NULL
k <- 10

library(plyr)
pbar <- create_progress_bar('text')
pbar$init(k)

for(i in 1:k){
    index <- sample(1:nrow(data), round(0.9 * nrow(data)))
    train.cv <- scaled[index,]
    test.cv <- scaled[-index,]

    nn <- neuralnet(f, data = train.cv, hidden = c(5, 3), linear.output = F)

    pr.nn <- compute(nn, test.cv[,1:4])
    pr.nn <- pr.nn$net.result * (max(data$Irri) - min(data$Irri)) + min(data$Irri)

    test.cv.r <- (test.cv$Irri) * (max(data$Irri) - min(data$Irri)) + min(data$Irri)

    cv.error[i] <- sum((test.cv.r - pr.nn)^2) / nrow(test.cv)

    pbar$step()
}

mean(cv.error)

boxplot(cv.error, xlab = 'MSE CV', col = 'cyan',
        border = 'blue', names = 'CV error (MSE)',
        main = 'CV error (MSE) for NN', horizontal = TRUE)


# Repeat the cross-validation with a smaller network (5 and 2 neurons) and linear output
library(plyr)
pbar <- create_progress_bar('text')
pbar$init(k)

for(i in 1:k){
    index <- sample(1:nrow(data), round(0.9 * nrow(data)))
    train.cv <- scaled[index,]
    test.cv <- scaled[-index,]

    nn <- neuralnet(f, data = train.cv, hidden = c(5, 2), linear.output = T)

    pr.nn <- compute(nn, test.cv[,1:4])
    pr.nn <- pr.nn$net.result * (max(data$Irri) - min(data$Irri)) + min(data$Irri)

    test.cv.r <- (test.cv$Irri) * (max(data$Irri) - min(data$Irri)) + min(data$Irri)

    cv.error[i] <- sum((test.cv.r - pr.nn)^2) / nrow(test.cv)

    pbar$step()
}

mean(cv.error)

boxplot(cv.error, xlab = 'MSE CV', col = 'cyan',
        border = 'blue', names = 'CV error (MSE)',
        main = 'CV error (MSE) for NN', horizontal = TRUE)
