Recommendation System

import pandas as pd

import numpy as np

ratings_data = pd.read_csv(r'C:\Users\DELL\Anaconda2\sar.csv')  # raw string avoids '\U' escape errors in Windows paths

ratings_data.head()

movie_names = pd.read_csv(r"C:\Users\DELL\Anaconda2\movies.csv")

movie_names.head()

movie_data = pd.merge(ratings_data, movie_names, on='movieId')

movie_data.head()

movie_data.groupby('title')['rating'].mean().head()

movie_data.groupby('title')['rating'].mean().sort_values(ascending=False).head()

movie_data.groupby('title')['rating'].count().sort_values(ascending=False).head()

ratings_mean_count = pd.DataFrame(movie_data.groupby('title')['rating'].mean())

ratings_mean_count['rating_counts'] = pd.DataFrame(movie_data.groupby('title')['rating'].count())

import matplotlib.pyplot as plt

import seaborn as sns

sns.set_style('dark')

plt.figure(figsize=(8,6))

plt.rcParams['patch.force_edgecolor'] = True

ratings_mean_count['rating_counts'].hist(bins=50)

plt.figure(figsize=(8,6))

plt.rcParams['patch.force_edgecolor'] = True

sns.jointplot(x='rating', y='rating_counts', data=ratings_mean_count, alpha=0.4)

user_movie_rating = movie_data.pivot_table(index='userId', columns='title', values='rating')

user_movie_rating.head()

forrest_gump_ratings = user_movie_rating['Forrest Gump (1994)']

forrest_gump_ratings.head()

movies_like_forrest_gump = user_movie_rating.corrwith(forrest_gump_ratings)

corr_forrest_gump = pd.DataFrame(movies_like_forrest_gump, columns=['Correlation'])

corr_forrest_gump.dropna(inplace=True)

corr_forrest_gump.head()

corr_forrest_gump.sort_values('Correlation', ascending=False).head(10)

corr_forrest_gump = corr_forrest_gump.join(ratings_mean_count['rating_counts'])

corr_forrest_gump.head()

corr_forrest_gump[corr_forrest_gump['rating_counts'] > 50].sort_values('Correlation', ascending=False).head()
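The same item-based steps can be wrapped into a small helper so recommendations can be produced for any title in the pivot table. This is a minimal sketch built only from the objects defined above (user_movie_rating and ratings_mean_count); the function name similar_movies and the min_ratings threshold are illustrative choices, not part of the original exercise.

def similar_movies(title, min_ratings=50, top_n=10):
    # correlate every movie's rating column with the chosen title
    target_ratings = user_movie_rating[title]
    corr = user_movie_rating.corrwith(target_ratings)
    corr = pd.DataFrame(corr, columns=['Correlation']).dropna()
    # keep only movies with enough ratings for the correlation to be meaningful
    corr = corr.join(ratings_mean_count['rating_counts'])
    corr = corr[corr['rating_counts'] > min_ratings]
    return corr.sort_values('Correlation', ascending=False).head(top_n)

similar_movies('Forrest Gump (1994)')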

Page Rank

import numpy as np

a = np.array([[0, 0, 0,   0,   0,   0, 0, 0,   0,   0,   0,   0  ],
              [0, 1, 0,   0,   0,   0, 0, 0,   0,   0,   0,   0  ],
              [0, 0, 0,   1/2, 1/2, 0, 0, 0,   0,   0,   0,   0  ],
              [0, 0, 0,   0,   1,   0, 0, 0,   0,   0,   0,   0  ],
              [0, 0, 0,   0,   0,   1, 0, 0,   0,   1,   0,   0  ],
              [0, 0, 0,   0,   0,   0, 1, 0,   0,   0,   0,   0  ],
              [0, 0, 0,   0,   0,   0, 0, 2/3, 2/3, 0,   0,   2/3],
              [0, 0, 0,   0,   0,   0, 0, 0,   1,   0,   0,   0  ],
              [0, 0, 0,   0,   0,   0, 0, 0,   0,   2/3, 2/3, 2/3],
              [0, 1, 0,   0,   0,   0, 0, 0,   0,   0,   0,   0  ],
              [0, 0, 0,   0,   0,   0, 1, 0,   0,   0,   0,   1  ],
              [0, 0, 0,   0,   0,   0, 0, 0,   0,   0,   0,   0  ]])

n = 15

d = 0.85

d_matrix = np.array([[d]]*a.shape[1])

a_trans = a.T

for i in range(n):
    a1 = np.matmul(a_trans, d_matrix)
    d_matrix = a1

print('Page Ranks for matrix of size ' + str(a.shape[0]))

print(d_matrix)

2.

import numpy as np

a = np.array([[0,   0, 0.5, 0, 0, 0, 0,   0,   0,   0],
              [0.5, 0, 0,   0, 0, 0, 0.5, 0,   0,   0],
              [0,   0, 0,   1, 0, 0, 0,   0,   0,   0],
              [0,   0, 0,   0, 0, 1, 0,   0,   0,   0],
              [0.5, 0, 0,   0, 0, 0, 0,   0,   0.5, 0],
              [0,   0, 0,   0, 0, 0, 0,   0,   0,   0],
              [0,   0, 0,   0, 0, 0, 0,   0,   0,   1],
              [0,   0, 0,   0, 0, 0, 0,   0,   1,   0],
              [0,   0, 0,   0, 0, 1, 0,   0,   0,   0],
              [0,   0, 0,   0, 0, 0, 0,   1,   0,   0]])

n = 15

d = 0.85

d_matrix = np.array([[d]]*a.shape[1])

a_trans = a.T

for i in range(n):
    a1 = np.matmul(a_trans, d_matrix)
    d_matrix = a1

print('Page Ranks for matrix of size ' + str(a.shape[0]))

print(d_matrix)
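The loops above simply iterate the link matrix against a constant vector. The textbook PageRank update also applies the damping factor and a teleportation term at every step; a minimal sketch of that version is below. It assumes link_matrix[i, j] is the transition weight from page i to page j (the matrices above are close to, but not exactly, row-stochastic), and the function name pagerank is illustrative.

def pagerank(link_matrix, d=0.85, n_iter=15):
    n_pages = link_matrix.shape[0]
    rank = np.full((n_pages, 1), 1.0 / n_pages)   # start from a uniform distribution
    for _ in range(n_iter):
        # damped update: teleport with probability (1 - d), follow links with probability d
        rank = (1 - d) / n_pages + d * np.matmul(link_matrix.T, rank)
    return rank

print(pagerank(a))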

k-means

import pandas as pd

import pylab as pl

from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

variables = pd.read_csv(r'C:\Users\DELL\Anaconda2\sar.csv')  # raw string avoids '\U' escape errors in Windows paths

Y = variables[['movieId']]

X = variables[['rating']]

StartJunction = variables[['timestamp']]

EndJunction = variables[['userId']]

X_norm = (X -X.mean()) / (X.max() -X.min())

Y_norm = (Y -Y.mean()) / (Y.max() -Y.min())

Nc = range(1, 20)

kmeans = [KMeans(n_clusters=i) for i in Nc]

kmeans

score = [kmeans[i].fit(Y_norm).score(Y_norm) for i in range(len(kmeans))]

score

pl.plot(Nc,score)
pl.xlabel('Number of Clusters')

pl.ylabel('Score')

pl.title('Elbow Curve')

pca = PCA(n_components=1).fit(Y_norm)

pca_d = pca.transform(Y_norm)

pca_c = pca.transform(X_norm)

kmeans=KMeans(n_clusters=3)

kmeansoutput=kmeans.fit(Y_norm)

kmeansoutput

pl.figure('3 Cluster K-Means')

pl.scatter(pca_d[:, 0], pca_c[:, 0], c=kmeansoutput.labels_)

pl.xlabel('Rating')

pl.ylabel('UserId')

pl.title('3 Cluster K-Means')

pl.show()
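The elbow curve gives a visual hint for k; a complementary check is the silhouette score, which is highest when clusters are compact and well separated. This is a minimal sketch using scikit-learn's silhouette_score on the same normalized data; the range of k values tried here is an arbitrary choice.

from sklearn.metrics import silhouette_score

for k in range(2, 8):
    labels = KMeans(n_clusters=k).fit_predict(Y_norm)
    # silhouette_score is in [-1, 1]; larger means better-separated clusters
    print(k, silhouette_score(Y_norm, labels))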

Mean Shift Clustering

import numpy as np

from sklearn.cluster import MeanShift

from sklearn.datasets import make_blobs  # samples_generator was removed in newer scikit-learn releases

import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

from matplotlib import style

style.use("ggplot")

centers = [[1,1,1],[5,5,5],[3,10,10]]

X, _ = make_blobs(n_samples = 500, centers = centers, cluster_std = 1.5)

ms = MeanShift()

ms.fit(X)

labels = ms.labels_

cluster_centers = ms.cluster_centers_

print(cluster_centers)

n_clusters_ = len(np.unique(labels))
print("Number of estimated clusters:", n_clusters_)

colors = 10*['r','g','b','c','k','y','m']

print(colors)

print(labels)

fig = plt.figure()

ax = fig.add_subplot(111, projection='3d')

for i in range(len(X)):
    ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')

ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], cluster_centers[:, 2],
           marker="x", color='k', s=150, linewidths=5, zorder=10)

plt.show()
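Agglomerative (bottom-up hierarchical) clustering, the counterpart of the divisive approach in the next section, can be run on the same generated blobs for comparison. This is a minimal sketch using scikit-learn's AgglomerativeClustering; the choice of 3 clusters and Ward linkage is illustrative, not part of the original notes.

from sklearn.cluster import AgglomerativeClustering

agg = AgglomerativeClustering(n_clusters=3, linkage='ward')
agg_labels = agg.fit_predict(X)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=agg_labels, marker='o')  # color points by cluster label
plt.show()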

Divisive Clustering

import numpy as np

import pandas as pd

num_clusters = 0

mat = np.array([[0,2,6,10,9],[2,0,5,9,8],[6,5,0,4,5],[10,9,4,0,3],[9,8,5,3,0]])

all_elements = ['a','b','c','d','e']

dissimilarity_matrix = pd.DataFrame(mat,index=all_elements, columns=all_elements)

def avg_dissim_within_group_element(ele, element_list):
    max_diameter = -np.inf
    sum_dissm = 0
    for i in element_list:
        sum_dissm += dissimilarity_matrix[ele][i]
        if dissimilarity_matrix[ele][i] > max_diameter:
            max_diameter = dissimilarity_matrix[ele][i]
    if len(element_list) > 1:
        avg = sum_dissm / (len(element_list) - 1)
    else:
        avg = 0
    return avg
def avg_dissim_across_group_element(ele, main_list, splinter_list):
    if len(splinter_list) == 0:
        return 0
    sum_dissm = 0
    for j in splinter_list:
        sum_dissm = sum_dissm + dissimilarity_matrix[ele][j]
    avg = sum_dissm / len(splinter_list)
    return avg

def splinter(main_list, splinter_group):
    most_dissm_object_value = -np.inf
    most_dissm_object_index = None
    for ele in main_list:
        x = avg_dissim_within_group_element(ele, main_list)
        y = avg_dissim_across_group_element(ele, main_list, splinter_group)
        diff = x - y
        if diff > most_dissm_object_value:
            most_dissm_object_value = diff
            most_dissm_object_index = ele
    if most_dissm_object_value > 0:
        return (most_dissm_object_index, 1)
    else:
        return (-1, -1)

def split(element_list):
    main_list = element_list
    splinter_group = []
    (most_dissm_object_index, flag) = splinter(main_list, splinter_group)
    while flag > 0:
        main_list.remove(most_dissm_object_index)
        splinter_group.append(most_dissm_object_index)
        (most_dissm_object_index, flag) = splinter(element_list, splinter_group)
    return (main_list, splinter_group)

def max_diameter(cluster_list):
    max_diameter_cluster_index = None
    max_diameter_cluster_value = -np.inf
    index = 0
    for element_list in cluster_list:
        for i in element_list:
            for j in element_list:
                if dissimilarity_matrix[i][j] > max_diameter_cluster_value:
                    max_diameter_cluster_value = dissimilarity_matrix[i][j]
                    max_diameter_cluster_index = index
        index += 1
    if max_diameter_cluster_value <= 0:
        return -1
    return max_diameter_cluster_index

current_clusters = [all_elements]

level = 1
index = 0

while index != -1:
    print(level, current_clusters)
    (a_clstr, b_clstr) = split(current_clusters[index])
    del current_clusters[index]
    current_clusters.append(a_clstr)
    current_clusters.append(b_clstr)
    index = max_diameter(current_clusters)
    level += 1

print(level, current_clusters)
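For comparison, the same dissimilarity matrix can be fed to SciPy's agglomerative routines, which is a quick way to sanity-check the splits produced by the divisive code above. This is a minimal sketch, assuming SciPy is available; squareform converts the full symmetric matrix to the condensed form that linkage expects, and average linkage and a 2-cluster cut are arbitrary illustrative choices.

from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

condensed = squareform(mat)                    # condensed distance vector from the 5x5 matrix
merge_tree = linkage(condensed, method='average')
print(merge_tree)                              # each row: clusters merged, distance, new cluster size
print(fcluster(merge_tree, t=2, criterion='maxclust'))   # flat labels for a 2-cluster cut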

Data Cleaning and Data Preprocessing

import pandas as pd

import numpy as np

sale2 = pd.read_csv(r"C:\Users\DELL\Anaconda2\sale2.csv", sep=",", header=0, index_col=0)  # raw string avoids '\U' escape errors

sale2.info()

sale2

sale2.isnull().sum()

mean=sale2['Order Quantity'].mean()

mean

sale2['Order Quantity']=sale2['Order Quantity'].fillna(mean)

sale2.isnull().sum()

sale2['Sales'] = sale2['Sales'].fillna(sale2['Sales'].mean())  # fill with the Sales column's own mean

sale2.info()

sale2.isnull().sum()

sale2['profit'] = sale2['profit'].fillna(sale2['profit'].mean())  # fill with the profit column's own mean

sale2.isnull().sum()

sale2

Preprocessing

from sklearn import preprocessing

std_scale = preprocessing.StandardScaler().fit(sale2[['Order Quantity', 'Sales']])

sale2_std = std_scale.transform(sale2[['Order Quantity', 'Sales']])

minmax_scale = preprocessing.MinMaxScaler().fit(sale2[['Order Quantity', 'Sales']])

sale2_minmax = minmax_scale.transform(sale2[['Order Quantity', 'Sales']])

print('Mean after standardization:\nOrder Quantity={:.2f}, Sales={:.2f}'
      .format(sale2_std[:, 0].mean(), sale2_std[:, 1].mean()))

print('\nStandard deviation after standardization:\nOrder Quantity={:.2f}, Sales={:.2f}'
      .format(sale2_std[:, 0].std(), sale2_std[:, 1].std()))

print('Min-value after min-max scaling:\nOrder Quantity={:.2f}, Sales={:.2f}'
      .format(sale2_minmax[:, 0].min(), sale2_minmax[:, 1].min()))

print('\nMax-value after min-max scaling:\nOrder Quantity={:.2f}, Sales={:.2f}'
      .format(sale2_minmax[:, 0].max(), sale2_minmax[:, 1].max()))

Decimal Scaling

dec = sale2['Sales'].mean()

sar = dec/10000      # decimal scaling applied to the mean of Sales

sar

dec = sale2['Sales']

dec1 = sale2['Order Quantity']

sar = dec/10000      # decimal scaling of the Sales column

sar1 = dec1/100      # decimal scaling of the Order Quantity column

sar

sar1
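The divisors 10000 and 100 above are chosen by eye. Decimal scaling is usually stated as dividing by 10^j, where j is the number of digits in the largest absolute value, so that all scaled values fall in (-1, 1). The sketch below applies that rule; the helper name decimal_scale is illustrative, and it assumes the column contains at least one nonzero value.

def decimal_scale(column):
    # j = number of digits in the largest absolute value
    j = int(np.floor(np.log10(column.abs().max()))) + 1
    return column / (10 ** j)

decimal_scale(sale2['Sales']).head()
decimal_scale(sale2['Order Quantity']).head()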

Apriori

from mlxtend.frequent_patterns import apriori


dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
df

apriori(df, min_support=0.6)
apriori(df, min_support=0.6, use_colnames=True)

Selecting frequent itemsets

frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.8) ]

frequent_itemsets[ frequent_itemsets['itemsets'] == {'Onion', 'Eggs'} ]
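Frequent itemsets are normally turned into association rules as the final step. mlxtend provides association_rules for this; a minimal sketch on the itemsets mined above, where the confidence threshold of 0.7 is an arbitrary choice:

from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]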

FP-Growth (FP-Tree Construction)
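The functions below reference a treeNode class that is not defined in these notes. A minimal definition consistent with how the class is used (a children dict, a count incremented via inc, and a nodeLink pointer for the header table) is sketched here; it is an assumption added for completeness, not part of the original notes.

class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue        # item name
        self.count = numOccur        # number of transactions passing through this node
        self.nodeLink = None         # link to the next node holding the same item
        self.parent = parentNode
        self.children = {}

    def inc(self, numOccur):
        self.count += numOccur

    def disp(self, ind=1):
        # simple text dump of the tree, indented by depth
        print('  ' * ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind + 1)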

def createTree(dataSet, minSup=1):  # create FP-tree from dataset but don't mine
    headerTable = {}
    # go over dataSet twice
    for trans in dataSet:  # first pass counts frequency of occurrence
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
    for k in list(headerTable):  # remove items not meeting minSup
        if headerTable[k] < minSup:
            del(headerTable[k])
    freqItemSet = set(headerTable.keys())
    #print 'freqItemSet: ', freqItemSet
    if len(freqItemSet) == 0:
        return None, None  # if no items meet min support --> get out
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]  # reformat headerTable to use a node link
    #print 'headerTable: ', headerTable
    retTree = treeNode('Null Set', 1, None)  # create tree
    for tranSet, count in dataSet.items():  # go through dataset 2nd time
        localD = {}
        for item in tranSet:  # put transaction items in order
            if item in freqItemSet:
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
            updateTree(orderedItems, retTree, headerTable, count)  # populate tree with ordered freq itemset
    return retTree, headerTable  # return tree and header table

def updateTree(items, inTree, headerTable, count):
    if items[0] in inTree.children:  # check if orderedItems[0] in retTree.children
        inTree.children[items[0]].inc(count)  # increment count
    else:  # add items[0] to inTree.children
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        if headerTable[items[0]][1] == None:  # update header table
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:  # call updateTree() with remaining ordered items
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)

def updateHeader(nodeToTest, targetNode):  # this version does not use recursion
    while nodeToTest.nodeLink != None:  # do not use recursion to traverse a linked list!
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode

def loadSimpDat():
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat

def createInitSet(dataSet):
    retDict = {}
    for trans in dataSet:
        retDict[frozenset(trans)] = 1
    return retDict

simpDat = loadSimpDat()

simpDat

initSet = createInitSet(simpDat)

initSet
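With the helpers above, the FP-tree can be built from the toy transactions and inspected. This is a minimal usage sketch; the minimum support of 3 is an illustrative choice, and disp comes from the treeNode sketch added earlier.

myFPtree, myHeaderTab = createTree(initSet, 3)
myFPtree.disp()   # print the tree structure, indented by depth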
from sklearn.datasets import load_iris

data = load_iris()

>>> data.target[[10, 25, 50]]
array([0, 0, 1])

>>> list(data.target_names)
['setosa', 'versicolor', 'virginica']

load_boston([return_X_y])            Load and return the boston house-prices dataset (regression).
load_iris([return_X_y])              Load and return the iris dataset (classification).
load_diabetes([return_X_y])          Load and return the diabetes dataset (regression).
load_digits([n_class, return_X_y])   Load and return the digits dataset (classification).
load_linnerud([return_X_y])          Load and return the linnerud dataset (multivariate regression).
load_wine([return_X_y])              Load and return the wine dataset (classification).
load_breast_cancer([return_X_y])     Load and return the breast cancer wisconsin dataset (classification).
Decision tree
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier        # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics                             # Import scikit-learn metrics module for accuracy calculation

col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']

# load dataset
pima = pd.read_csv("pima-indians-diabetes.csv", header=None, names=col_names)
pima.head()

# split dataset in features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree']
X = pima[feature_cols]   # Features
y = pima.label           # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)  # 70% training and 30% test

# Create Decision Tree classifier object
clf = DecisionTreeClassifier()

# Train Decision Tree classifier
clf = clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
from sklearn.tree import export_graphviz
from io import StringIO  # sklearn.externals.six was removed from newer scikit-learn releases
from IPython.display import Image
import pydotplus

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,
                feature_names=feature_cols, class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('diabetes.png')
Image(graph.create_png())
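An unconstrained tree tends to overfit the training split. A common follow-up is to refit with a depth limit and an entropy criterion and compare the test accuracy; this is a minimal sketch, where max_depth=3 is an arbitrary illustrative choice.

# Refit with simple complexity constraints and compare accuracy on the same split
clf_pruned = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1)
clf_pruned = clf_pruned.fit(X_train, y_train)
y_pred_pruned = clf_pruned.predict(X_test)
print("Accuracy (max_depth=3):", metrics.accuracy_score(y_test, y_pred_pruned))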
