Initial data analysis functions

Create a function in R:

The function() keyword allows us to define our own functions. Between the parentheses we list the names of the parameters that the function will accept.

NameFunction = function(parameter1, parameter2, ...) {
  statements
  return(object)
}

# Return the arithmetic mean of the two arguments.
#
# x, y: numeric values (or vectors, combined element-wise).
NameFunction0 <- function(x, y) {
  total <- x + y
  total / 2
}
NameFunction0(4,5)

## [1] 4.5

If we want to use a parameter that may or may not be specified, we give it the default value NULL (parameter = NULL) and check for it with is.null():

# Return `x`, or `x + y` when the optional argument `y` is supplied.
#
# x: value that is always required.
# y: optional value; the default NULL means "not specified".
NameFunction1 <- function(x, y = NULL) {
  if (is.null(y)) {
    return(x)
  }
  x + y
}
NameFunction1(5,10) # with 2 parameters (x, y)

## [1] 15

NameFunction1(5) # with only one parameter (x)

## [1] 5

Initial Data Analysis Functions

Here we have created some basic functions that can be used for initial data analysis of our data:

INLIERS returns, in ascending order, the values of a vector that lie strictly between the values start and end:

# Return the values of `x` that lie strictly between `start` and `end`,
# in ascending order.
#
# x: vector of values to filter.
# start, end: exclusive lower and upper limits.
#
# Missing values do not appear in the result because sort() drops them.
INLIERS <- function(x, start, end) {
  inside <- x > start & x < end
  sort(x[inside])
}

x = c(1:100, NA, NA)
x

##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
##  [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
##  [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
##  [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
##  [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100  NA  NA

INLIERS(x, 2,55)

##  [1]  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [24] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## [47] 49 50 51 52 53 54

OUTLIERS returns, in ascending order, the values of a vector that lie outside the values start and end (missing values are appended at the end):

# Return the values of `x` that are below `s` or above `e`, in ascending
# order, followed by every missing value found in `x`.
#
# x: vector of values to filter.
# s, e: lower and upper limits; values with s <= x <= e are discarded.
OUTLIERS <- function(x, s, e) {
  missing_values <- x[is.na(x)]
  # sort() drops the NA entries produced by the comparison, so the missing
  # values are appended explicitly afterwards.
  outside <- sort(x[x < s | x > e])
  c(outside, missing_values)
}

OUTLIERS(x, 2,55)

##  [1]   1  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
## [18]  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88
## [35]  89  90  91  92  93  94  95  96  97  98  99 100  NA  NA

The following functions INMatrix() and OUTMatrix() do the same as the functions INLIERS() and OUTLIERS(), but instead of working with a vector, here we work with a matrix (or data frame) and select whole rows according to the values of a vector y.

head(iris) #here we are using the iris dataset

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Return the rows of `x` whose corresponding value in `y` is below `s` or
# above `e`.
#
# x: matrix or data frame to filter.
# y: vector with one value per row of `x` (typically one of its columns).
# s, e: lower and upper limits; rows with s <= y <= e are discarded.
#
# Rows whose `y` is missing are treated as outliers, as OUTLIERS() does, and
# are returned with their real contents instead of as all-NA placeholder rows.
# The result always keeps the two-dimensional shape of `x`, even when a single
# row is selected or `x` has a single column.
OUTMatrix <- function(x, y, s, e) {
  outside <- is.na(y) | y < s | y > e
  x[outside, , drop = FALSE]
}
OM = OUTMatrix(iris, iris$Sepal.Length,5,6)
head(OM)

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa

# Return the rows of `x` whose corresponding value in `y` lies strictly
# between `s` and `e`.
#
# x: matrix or data frame to filter.
# y: vector with one value per row of `x` (typically one of its columns).
# s, e: exclusive lower and upper limits.
#
# Rows whose `y` is missing are excluded, as INLIERS() does; which() is used
# so that an NA in the comparison cannot produce all-NA placeholder rows.
# The result always keeps the two-dimensional shape of `x`, even when a single
# row is selected or `x` has a single column.
INMatrix <- function(x, y, s, e) {
  inside <- which(y > s & y < e)
  x[inside, , drop = FALSE]
}
IM = INMatrix(iris, iris$Sepal.Length,5,6)
head(IM)

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 15          5.8         4.0          1.2         0.2  setosa
## 16          5.7         4.4          1.5         0.4  setosa
## 17          5.4         3.9          1.3         0.4  setosa

INFOOUTLIERS() prints summary information about the values that are outside the selected range (s,e):

# Print a short report about the values of `x` that are below `s` or above `e`.
#
# x: vector of values to inspect.
# s, e: lower and upper limits; values with s <= x <= e are not outliers.
#
# The report lists the sorted outliers, their distinct values, the smallest
# and largest outlier, and how many outliers there are. Missing values in `x`
# are listed and counted as outliers. When there is no non-missing outlier,
# MIN and MAX are reported as NA (calling min()/max() on an empty vector would
# emit warnings and print Inf / -Inf).
#
# Returns NULL invisibly; the function is called for its printed output.
INFOOUTLIERS <- function(x, s, e) {
  # The comparison yields NA for missing values, so `flagged` contains both
  # the real outliers and one NA per missing value of `x`.
  flagged <- x[x < s | x > e]
  missing_values <- x[is.na(x)]
  sorted_outliers <- sort(flagged) # sort() drops the NA entries
  observed <- flagged[!is.na(flagged)]

  if (length(observed) > 0) {
    lowest <- min(observed)
    highest <- max(observed)
  } else {
    lowest <- NA
    highest <- NA
  }

  cat(
    "\n", "SORTED OUTLIERS:", sorted_outliers, missing_values, "||",
    "\n", "OUTLIER VALUES:", unique(sorted_outliers), unique(missing_values), "||",
    "\n", "MIN:", lowest, " ", "MAX:", highest, "||",
    "\n", "QUANTITY OF OUTLIERS:", length(flagged), "||"
  )
  invisible(NULL)
}
x = c(1:50, 1:50, NA, NA)
x

##   [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
##  [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
##  [47] 47 48 49 50  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
##  [70] 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
##  [93] 43 44 45 46 47 48 49 50 NA NA

INFOOUTLIERS(x,4,45)

## 
##  SORTED OUTLIERS: 1 1 2 2 3 3 46 46 47 47 48 48 49 49 50 50 NA NA || 
##  OUTLIER VALUES: 1 2 3 46 47 48 49 50 NA || 
##  MIN: 1   MAX: 50 || 
##  QUANTITY OF OUTLIERS: 18 ||

OUTLIERSDIF() prints summary values (min, 1st quartile, median, mean, 3rd quartile, max) twice: once taking into account all the non-missing data, and once with the outliers removed, i.e. keeping only the values that lie strictly between s and e. This allows us to compare how the summary values of the data change when the values outside the range (s,e) are removed.

# Print the minimum, 1st quartile, median, mean, 3rd quartile and maximum of
# `x` twice: for all non-missing values, and for the non-missing values that
# lie strictly between `s` and `e` (the "OUTLIERS REMOVED" line).
#
# x: vector of values to summarise; missing values are discarded first.
# s, e: exclusive lower and upper limits of the range that is kept.
#
# The function is called for its printed output.
OUTLIERSDIF <- function(x, s, e) {
  # Collect the six summary statistics of a vector, each kept as a separate
  # list element so that cat() formats it exactly as computed.
  six_numbers <- function(values) {
    list(
      low = min(values),
      q1 = quantile(values, 0.25, na.rm = TRUE),
      mid = median(values),
      avg = mean(values),
      q3 = quantile(values, 0.75, na.rm = TRUE),
      high = max(values)
    )
  }

  observed <- x[!is.na(x)]
  kept <- observed[observed > s & observed < e]

  all_stats <- six_numbers(observed)
  kept_stats <- six_numbers(kept)

  cat(
    "\n", "ALL DATA SUMMARY:",
    "MIN:", all_stats$low, "-",
    "1stQUANT:", all_stats$q1, "-",
    "MEDIAN:", all_stats$mid, "-",
    "MEAN:", all_stats$avg, "-",
    "3rdQUANT", all_stats$q3, "-",
    "MAX:", all_stats$high,
    "\n", "\n", "OUTLIERS REMOVED:",
    "MIN:", kept_stats$low, "-",
    "1stQUANT:", kept_stats$q1, "-",
    "MEDIAN:", kept_stats$mid, "-",
    "MEAN:", kept_stats$avg, "-",
    "3rdQUANT", kept_stats$q3, "-",
    "MAX:", kept_stats$high
  )
}

OUTLIERSDIF(x, 7,44)

## 
##  ALL DATA SUMMARY: MIN: 1 - 1stQUANT: 13 - MEDIAN: 25.5 - MEAN: 25.5 - 3rdQUANT 38 - MAX: 50 
##  
##  OUTLIERS REMOVED: MIN: 8 - 1stQUANT: 16.75 - MEDIAN: 25.5 - MEAN: 25.5 - 3rdQUANT 34.25 - MAX: 43

GRAPHOUTLIERS() draws two boxplots side by side to compare the summary values when using all the data and when using only the data within the range (s,e). For further information about boxplot values: http://dataworldblog.blogspot.com.es/2017/06/univariate-graphs-part-3-boxplot.html

# Draw two notched boxplots side by side: one for all of `x` and one for the
# values of `x` that lie strictly between `s` and `e`.
#
# x: vector of values to plot.
# s, e: exclusive lower and upper limits of the range kept in the second box.
#
# Returns the value of the boxplot() call.
GRAPHOUTLIERS <- function(x, s, e) {
  within_range <- x[x > s & x < e]
  boxplot(
    x, within_range,
    notch = TRUE,
    main = "GRAPHOUTLIERS",
    col = c("pink", "orange"),
    names = c("All data", "Without outliers")
  )
}

GRAPHOUTLIERS(x,7,44)

Data World Blog

Search This Blog

Initial data analysis functions

Create a function in R:

Initial Data Analysis Functions

Labels

Popular posts from this blog

Support Vector Machines (SVM) in R (package 'kernlab')

Initial Data Analysis (infert dataset)

Ant Colony Optimization (part 2) : Graph optimization using ACO