Code aus der Vorlesung

Vorlesung vom 06.12.2024

Häufigkeiten

# Erstellen eines Vektors mit den Verspätungen
verspaetungen <- c(10, 20, 5, 10, 30, 
                   25, 5, 5, 10, 20, 
                   15, 10, 5, 20, 15, 
                   10, 5, 20, 25, 10)

#Tabelle mit absoluten Häufigkeiten
table(verspaetungen)

verspaetungen
 5 10 15 20 25 30 
 5  6  2  4  2  1

#Tabelle mit relativen Häufigkeiten
prop.table(table(verspaetungen))

verspaetungen
   5   10   15   20   25   30 
0.25 0.30 0.10 0.20 0.10 0.05

# Datensätze in R
?mtcars
table(mtcars$gear)


 3  4  5 
15 12  5

Visualisierung von Häufigkeiten

# Stabiagramm auf Basis absoluter Häufigkeiten
plot(table(verspaetungen), type = "h",
     xlab = "Verspätungen",
     ylab = "absolute Häufigkeiten")

# Säulendiagramm auf Basis absoluter Häufigkeiten
barplot(table(verspaetungen),
        xlab = "Verspätungen",
        ylab = "absolute Häufigkeiten")

plot(table(mtcars$hp), 
     type = "h",
     xlab = "PS",
     ylab = "absolute Häufigkeit")

# Zugriff auf einzelne Elemente
verspaetungen[10]

[1] 20

verspaetungen[c(5,10)]

[1] 30 20

mtcars[10,5]

[1] 3.92

mtcars[10,]

          mpg cyl  disp  hp drat   wt qsec vs am gear carb
Merc 280 19.2   6 167.6 123 3.92 3.44 18.3  1  0    4    4

mtcars[,5]

 [1] 3.90 3.90 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 3.92 3.07 3.07 3.07 2.93
[16] 3.00 3.23 4.08 4.93 4.22 3.70 2.76 3.15 3.73 3.08 4.08 4.43 3.77 4.22 3.62
[31] 3.54 4.11

Histogramm

# Histogramm

hist(mtcars$hp, 
#     breaks = c(0,50,100,200,500),
     freq = FALSE,
     xlab = "PS",
     main = "Histogramm von PS")

Empirische Verteilungsfunktion

# Empirische Verteilungsfunktion

plot(ecdf(mtcars$hp))

plot(ecdf(rnorm(100000)))

Boxplot

# Boxplot
boxplot(mtcars$hp, horizontal = TRUE)

Streuungsmaße

## Streuungsmaße

mieten <- c(300,250,400,500,250,600,300,300,450,400)

mittlere_absolute_abweichung <- mean(abs(mieten - median(mieten)))
varianz <- mean((mieten - mean(mieten))^2)
standardabweichung <- sqrt(varianz)

var(mieten)*9/10

[1] 12125

Vorlesung vom 13.12.2024

Softwareaufgabe 3.1

library(ggplot2)



# a. Durchschnitte berechnen
mean(diamonds$carat)

[1] 0.7979397

median(diamonds$carat)

[1] 0.7

# b. Dezile bestimmen
quantile(diamonds$carat, probs = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9))

 10%  20%  30%  40%  50%  60%  70%  80%  90% 
0.31 0.35 0.42 0.53 0.70 0.90 1.01 1.13 1.51

# c. Visualisierung der Häufigkeiten
# Boxplot
boxplot(diamonds$carat)

# Histogramm
hist(diamonds$carat, breaks = 30)

# empirische Verteilungsfunktion
plot(ecdf(diamonds$carat))

# Häufigkeiten von "cut"
barplot(table(diamonds$cut))

## Streuung von 'carat'
sd(diamonds$carat)

[1] 0.4740112

# Korrelation

cor(diamonds$carat, diamonds$price)

[1] 0.9215913

plot(x = diamonds$carat,
     y = diamonds$price,
     xlab = "Karat",
     ylab = "Preis")

# Logarithmische Skala

plot(x = log(diamonds$carat),
     y = log(diamonds$price),
     xlab = "Karat",
     ylab = "Preis")

cor(log(diamonds$carat),
    log(diamonds$price))

[1] 0.9659137

# Korrelationsmatrix
install.packages("MSBStatsData")

The following package(s) will be installed:
- MSBStatsData [0.0.2]
These packages will be installed into "/builds/buecker/buecker.fh-muenster.io/renv/library/linux-ubuntu-noble/R-4.5/x86_64-pc-linux-gnu".

# Installing packages --------------------------------------------------------
- Installing MSBStatsData ...                   OK [linked from cache]
Successfully installed 1 package in 8.6 milliseconds.

library(MSBStatsData)

round(cor(decathlon),2)

                race100m longjump shotput highjump race400m race110mhurdles
race100m            1.00    -0.48   -0.15    -0.12     0.57            0.45
longjump           -0.48     1.00    0.25     0.36    -0.31           -0.38
shotput            -0.15     0.25    1.00     0.16    -0.03           -0.25
highjump           -0.12     0.36    0.16     1.00    -0.11           -0.25
race400m            0.57    -0.31   -0.03    -0.11     1.00            0.38
race110mhurdles     0.45    -0.38   -0.25    -0.25     0.38            1.00
discus             -0.12     0.20    0.72     0.14    -0.04           -0.22
polevault          -0.17     0.27    0.25     0.19    -0.13           -0.29
javelinthrow       -0.06     0.17    0.44     0.07    -0.02           -0.13
race1500m          -0.09     0.02    0.11    -0.01     0.38            0.01
                discus polevault javelinthrow race1500m
race100m         -0.12     -0.17        -0.06     -0.09
longjump          0.20      0.27         0.17      0.02
shotput           0.72      0.25         0.44      0.11
highjump          0.14      0.19         0.07     -0.01
race400m         -0.04     -0.13        -0.02      0.38
race110mhurdles  -0.22     -0.29        -0.13      0.01
discus            1.00      0.27         0.42      0.08
polevault         0.27      1.00         0.19     -0.01
javelinthrow      0.42      0.19         1.00      0.02
race1500m         0.08     -0.01         0.02      1.00

plot(x = decathlon$race100m, 
     y = decathlon$longjump)

plot(decathlon)

Zusammenhänge

df_rating <- data.frame(
  Unternehmen = 1:10,
  Rating =  factor(c("A+", "BB+", "AA-", "AA", "BBB", 
                     "BBB+", "AA", "AAA", "AA+", "AA"), 
                   order = TRUE,
                   levels = c("BB+", "BBB", "BBB+", "A+", 
                              "AA-", "AA", "AA+", "AAA")),
  Unternehmensgroeße = factor(c("groß","mittel","groß",
                                "groß","klein", "mittel",
                                "mittel","groß","groß",
                                "mittel"),
                              ordered = TRUE,
                              levels = c("klein", "mittel", 
                                         "groß"))
)


# Rangkorrelationskoeffizient nach Spearman
cor(rank(df_rating$Rating, 
         ties.method = "average"),
    rank(df_rating$Unternehmensgroeße, 
         ties.method = "average"),
    method = c("spearman"))

[1] 0.5765528

# Rangkorrelationskoeffizient nach Kendall
cor(rank(df_rating$Rating, 
         ties.method = "average"),
    rank(df_rating$Unternehmensgroeße, 
         ties.method = "average"),
    method = c("kendall"))

[1] 0.487108

library(ggplot2)


cor(rank(diamonds$color, ties.method = "average"),
    rank(diamonds$clarity, ties.method = "average"),
    method = "spearman")

[1] 0.03031204

## Kontingenzmaße

kontingenztafel <- table(am = mtcars$am, gear = mtcars$gear)
berechnung <- chisq.test(kontingenztafel)

# X^2
berechnung$statistic

X-squared 
 20.94467

# Cramérs V
k <- min(dim(kontingenztafel))
n <- sum(kontingenztafel)

as.numeric(sqrt(berechnung$statistic/(n*(k-1))))

[1] 0.8090247

plot(mtcars$wt, mtcars$mpg)

cor(mtcars$wt, mtcars$mpg)

[1] -0.8676594

gewicht <- cut(mtcars$wt, c(0,3.5,6))
reichweite <- cut(mtcars$mpg, c(10,20,40))

kontingenztafel <- chisq.test(table(gewicht, reichweite))

# Cramers V
sqrt(kontingenztafel$statistic / 32)

X-squared 
0.5719694

Regression

library(ggplot2)

regression <- lm(price ~ carat, data = diamonds)

plot(x = diamonds$carat,
     y = diamonds$price,
     xlab = "Gewicht (Karat)",
     ylab = "Preis (USD)")
abline(regression, col = "red")

Vorlesung vom 14.12.2024

Übungsaufgabe 5.9

# Daten erzeugen
wohnflaeche <- c(55,65,65,80,95)
kaltmiete <- c(300,340,410,435,530)
df <- data.frame(wohnflaeche, kaltmiete)

# Korrelationen berechnen
cor(wohnflaeche, kaltmiete)

[1] 0.9532826

cor(wohnflaeche, kaltmiete, method = "spearman")

[1] 0.9746794

cor(wohnflaeche, kaltmiete, method = "kendall")

[1] 0.9486833

# Zeichnung
plot(x = wohnflaeche, y = kaltmiete,
     xlab = "Wohnfläche",
     ylab = "Kaltmiete",
     main = "Zusammenhang Wohnfläche und Kaltmiete",
     xlim = c(0,100),
     ylim = c(0,600))

# Regression
regression <- lm(kaltmiete ~ wohnflaeche, data = df)
regression


Call:
lm(formula = kaltmiete ~ wohnflaeche, data = df)

Coefficients:
(Intercept)  wohnflaeche  
     12.143        5.429

summary_regression <- summary(regression)
summary_regression$r.squared

[1] 0.9087476

abline(regression, col = "red")

# Vorhersagen
df_neu <- data.frame(wohnflaeche = c(75,47,62))
predict(regression, newdata = df_neu)

       1        2        3 
419.2857 267.2857 348.7143

Beispiel Multiple Regression aus der Vorlesung

df_icecream <- data.frame(
  Eismenge = c(2000,2000,6000,4000,6000,
               4000,4000,7000,7000,8000),
  Temperatur =  c(10,  15,  20,  15,  25,  
                  25,  30,  30,  40,  40),
  Niederschlag = c(25,20,15,20,10,8,1,1,0,0),
  Wochentag = factor(c("Mittwoch", "Freitag", "Sonntag", 
                       "Freitag","Sonntag", "Mittwoch", 
                       "Freitag", "Sonntag", "Freitag", 
                       "Sonntag"), 
                     levels = c("Mittwoch", 
                                "Freitag", 
                                "Sonntag"))
) 

# Berechnen des Mutiplen Linearen Modells
lm_model <- lm(Eismenge ~ Temperatur + Niederschlag + Wochentag, 
               data = df_icecream)

summary(lm_model)


Call:
lm(formula = Eismenge ~ Temperatur + Niederschlag + Wochentag, 
    data = df_icecream)

Residuals:
     1      2      3      4      5      6      7      8      9     10 
-155.5 -996.1  349.7 1003.9 -267.2  155.5 -523.7  439.0  515.9 -521.5 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)  
(Intercept)      -1904.45    3111.82  -0.612   0.5673  
Temperatur         204.12      88.24   2.313   0.0686 .
Niederschlag        80.75      94.42   0.855   0.4315  
WochentagFreitag   223.80     731.48   0.306   0.7720  
WochentagSonntag  2261.17     769.61   2.938   0.0323 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 805.9 on 5 degrees of freedom
Multiple R-squared:  0.9188,    Adjusted R-squared:  0.8539 
F-statistic: 14.15 on 4 and 5 DF,  p-value: 0.00619

# Erstellen Sie eine Vorhersage für einen Tag 
# mit 25°C und 0mm Niederschlag!

df_icecream_new <- data.frame(Temperatur = c(25),
                              Niederschlag = c(0),
                              Wochentag = "Sonntag")

predict(lm_model, newdata = df_icecream_new)

      1 
5459.68

Vorlesung vom 20.12.2024

Übungsaufgabe 6.4

## Laden der Pakete
# install.packages("MSBStatsData")
library(MSBStatsData)


## Regression berechnen

lm_zehnkampf <- lm(longjump ~ race100m, data = decathlon)
lm_zehnkampf


Call:
lm(formula = longjump ~ race100m, data = decathlon)

Coefficients:
(Intercept)     race100m  
    13.0822      -0.5433

## Grafische Darstellung
plot(x = decathlon$race100m,
     y = decathlon$longjump,
     xlab = "Zeit im 100-Meter-Lauf (in Sekunden)",
     ylab = "Weite im Weitsprung (in Metern)")

abline(lm_zehnkampf, col = "red")

## Bestimmtheitsmaß bzw. R^2
summary(lm_zehnkampf)


Call:
lm(formula = longjump ~ race100m, data = decathlon)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.02828 -0.18094  0.00203  0.19069  0.92360 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 13.08220    0.12390  105.59   <2e-16 ***
race100m    -0.54331    0.01101  -49.35   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2778 on 7966 degrees of freedom
Multiple R-squared:  0.2342,    Adjusted R-squared:  0.2341 
F-statistic:  2436 on 1 and 7966 DF,  p-value: < 2.2e-16

# Vorhersage

decathlon_neu <- data.frame(race100m = 10.5)
predict(lm_zehnkampf, newdata = decathlon_neu)

       1 
7.377496

Wahrscheinlichkeitsverteilungen

# Gleichverteilung
## Dichtefunktion
dunif(x = 2, min = 0, max = 10)

[1] 0.1

## Verteilungsfunktion

punif(q = 2, min = 0, max = 10) # F(2) = P(X<=2)

[1] 0.2

punif(q = -2, min = 0, max = 10)

[1] 0

punif(q = 12, min = 0, max = 10)

[1] 1

## Wahrscheinlichkeit für X zwischen 2 und 5
## P(2 <= X <= 5) = F(5) - F(2)
punif(5,0,10) - punif(2,0,10)

[1] 0.3

## Pseudo-Zufallszahlen

zz <- runif(10000000, min = 0, max = 10)
hist(zz, freq = FALSE, xlim = c(-5,15))

Vorlesung vom 17.01.2025

Würfelwurf

# Würfel
x <- sample(1:6, 10000000, replace = TRUE)
table(x)

x
      1       2       3       4       5       6 
1665911 1667012 1667954 1667412 1665333 1666378

Binomialverteiliung

# Anzahl Versuche: 5, Erfolgswahrscheinlichkeit: 0.6
# Gesucht: P(X>=2) = 1-P(X<=1) = 1-F(1)
1-pbinom(1, size = 5, prob = 0.6)

[1] 0.91296

dbinom(2,5,0.6)+dbinom(3,5,0.6)+dbinom(4,5,0.6)+dbinom(5,5,0.6)

[1] 0.91296

sum(dbinom(2:5,5,0.6))

[1] 0.91296

Normalverteilung

# X ~ N(1,4)
# gesucht: P(0 <= X <= 4) = P(X<=4) - P(X<=0)
#                         = F(4) - F(0)

pnorm(4, mean = 1, sd = sqrt(4)) -
  pnorm(0, mean = 1, sd = sqrt(4))

[1] 0.6246553

Aufgabe 8.9

1-pnorm(5,0.5,2)

[1] 0.01222447

pnorm(4,0.5,2)-pnorm(-4,0.5,2)

[1] 0.9477164

Aufgabe 8.8

pbinom(1, size = 30, prob = 0.15)

[1] 0.0480289

Aufgabe aus Vorlesung

pbinom(2,20,0.1)

[1] 0.6769268

1-pbinom(100,900,0.1)

[1] 0.122629

1-pnorm(10/9,0,1)

[1] 0.1332603

Aufgabe 10.1

x <- c(1.95, 1.80, 2.10, 2.82, 1.75, 2.01, 1.83, 1.90)
mean(x)

[1] 2.02

Konfidenzintervall

alpha <- 0.05
qnorm(1-alpha/2)

[1] 1.959964

t.test(x, conf.level=0.95)


    One Sample t-test

data:  x
t = 16.661, df = 7, p-value = 6.859e-07
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 1.733305 2.306695
sample estimates:
mean of x 
     2.02

Vorlesung vom 18.01.2025

Aufgabe 10.3

einkommen <- c(760, 540, 2400, 3900, 1200,
               1300, 100, 1760, 400, 4800)
n <- length(einkommen)
mean(einkommen)

[1] 1716

sd(einkommen)

[1] 1557.428

alpha = 0.1
qt(1-alpha/2, df = n-1)

[1] 1.833113

mean(einkommen) - qt(1-alpha/2, df = n-1) * sd(einkommen)/sqrt(n)

[1] 813.1882

mean(einkommen) + qt(1-alpha/2, df = n-1) * sd(einkommen)/sqrt(n)

[1] 2618.812

t.test(einkommen, conf.level = 0.9)


    One Sample t-test

data:  einkommen
t = 3.4842, df = 9, p-value = 0.006892
alternative hypothesis: true mean is not equal to 0
90 percent confidence interval:
  813.1882 2618.8118
sample estimates:
mean of x 
     1716

Aufgabe 10.7

binom.test(x = 290, n = 808, conf.level = 0.99)


    Exact binomial test

data:  290 and 808
number of successes = 290, number of trials = 808, p-value = 9.053e-16
alternative hypothesis: true probability of success is not equal to 0.5
99 percent confidence interval:
 0.3157707 0.4037399
sample estimates:
probability of success 
             0.3589109

Aufgabe 11.4

binom.test(x = 1395, n = 2000, alternative = "greater")


    Exact binomial test

data:  1395 and 2000
number of successes = 1395, number of trials = 2000, p-value < 2.2e-16
alternative hypothesis: true probability of success is greater than 0.5
95 percent confidence interval:
 0.6801479 1.0000000
sample estimates:
probability of success 
                0.6975

Aufgabe 11.7

zufriedenheiten <- c(260,150,30,60,
                     280,140,30,50)
zufriedenheiten_matrix <- matrix(zufriedenheiten, nrow = 2, byrow = TRUE)

chisq.test(zufriedenheiten_matrix)


    Pearson's Chi-squared test

data:  zufriedenheiten_matrix
X-squared = 1.9947, df = 3, p-value = 0.5735

# Der p-Wert von 0.5735 ist größer als alpha = 0.05, d.h.
# die Nullhypothese kann nicht abgelehnt werden, d.h.
# Gewerkschaftszugehörigkeit und Zufriedenheit sind unabhängig

Zurück nach oben