Added dataset and available code

2021-12-06 14:27:57 +01:00
parent f5809d3b42
commit 87fdc8ad53
3 changed files with 1304 additions and 0 deletions
--- a/project/read_SouthGermanCredit.R
+++ b/project/read_SouthGermanCredit.R
@@ -0,0 +1,170 @@
+setwd("/config/workspace/assistenz-r/dataset")  # NOTE(review): machine-specific absolute path
+dat <- read.table(file = "SouthGermanCredit.asc", header = TRUE)  # first line holds the column names
+
+## dat contains numbers for all variables.
+
+## variables duration, amount and age are truly quantitative
+## variables installment_rate, present_residence and number_credits are
+    ### quantitative in the data, but are in fact discretized scores for 
+    ### an underlying quantitative variable
+    ### and are thus stored as ordered factors below
+## variable people_liable is quantitative in the data but is in fact 
+    ### a binarized score (0 to 2 versus 3 or more)
+    ### and is thus stored as a factor below
+## all the numeric values (=level codes) 
+    ### for the categorical variables 
+    ### (including the discretized quantitative variables), 
+    ### are the P2 scores from Häußler (1979) 
+    ### which can be directly used in credit scoring (larger=better).
+    ### (Exceptions have been corrected in the raw data, 
+    ###     which implies that columns pers and gastarb have 
+    ###     entries opposite to those in Open Data LMU (2010)
+    ###     and the GermanCredit data from the UCI ML Repo.)
+
+## remember the column names as read from the file (Fahrmeir/Hamerle book names)
+nam_fahrmeirbook <- names(dat)
+
+### assign levels 
+### level assignment can be sanity-checked 
+### with Table 2.1 from the Fahrmeir/Hamerle book, 
+###     which gives proportions separated for good and bad credit risks.
+### That table is provided by Open Data LMU
+###     (https://doi.org/10.5282/ubm/data.23)
+###     together with a German language version of the data set.
+### A corresponding table for the English language data is produced 
+###     below for the final data (levels ordered by increasing code).
+### Level labels have been taken from package evtree, except for 
+###     the variable telephone (where the yes level has been made more detailed)
+###     and those variables that were quantitative and do not have level labels
+###     in evtree.
+
+## new English variable names, one per column of dat, in column order
+nam_evtree <- c(
+  "status", "duration", "credit_history", "purpose", "amount",
+  "savings", "employment_duration", "installment_rate",
+  "personal_status_sex", "other_debtors", "present_residence", "property",
+  "age", "other_installment_plans", "housing", "number_credits",
+  "job", "people_liable", "telephone", "foreign_worker", "credit_risk"
+)
+colnames(dat) <- nam_evtree
+
+## turn every column into a factor except duration (2), amount (5), age (13);
+## purpose (4, originally verw) gets fixed levels 0..10 so empty levels are kept
+for (k in setdiff(seq_len(21), c(2, 4, 5, 13))) {
+  dat[[k]] <- factor(dat[[k]])
+}
+dat[[4]] <- factor(dat[[4]], levels = as.character(0:10))
+
+## attach level labels (listed by increasing level code) to the factors;
+## installment_rate, present_residence and number_credits additionally
+## become ordered factors
+dat <- local({
+  for (v in c("installment_rate", "present_residence", "number_credits")) {
+    dat[[v]] <- ordered(dat[[v]])
+  }
+  labs <- list(
+    credit_risk = c("bad", "good"),
+    status = c(
+      "no checking account",
+      "... < 0 DM",
+      "0<= ... < 200 DM",
+      "... >= 200 DM / salary for at least 1 year"
+    ),
+    ## "critical account/other credits elsewhere" was
+    ## "critical account/other credits existing (not at this bank)"
+    credit_history = c(
+      "delay in paying off in the past",
+      "critical account/other credits elsewhere",
+      "no credits taken/all credits paid back duly",
+      "existing credits paid back duly till now",
+      "all credits at this bank paid back duly"
+    ),
+    purpose = c(
+      "others", "car (new)", "car (used)", "furniture/equipment",
+      "radio/television", "domestic appliances", "repairs", "education",
+      "vacation", "retraining", "business"
+    ),
+    savings = c(
+      "unknown/no savings account",
+      "... <  100 DM",
+      "100 <= ... <  500 DM",
+      "500 <= ... < 1000 DM",
+      "... >= 1000 DM"
+    ),
+    employment_duration = c(
+      "unemployed",
+      "< 1 yr",
+      "1 <= ... < 4 yrs",
+      "4 <= ... < 7 yrs",
+      ">= 7 yrs"
+    ),
+    installment_rate = c(">= 35", "25 <= ... < 35", "20 <= ... < 25", "< 20"),
+    other_debtors = c("none", "co-applicant", "guarantor"),
+    ## female : nonsingle was female : divorced/separated/married
+    ##    widowed females are not mentioned in the code table
+    personal_status_sex = c(
+      "male : divorced/separated",
+      "female : non-single or male : single",
+      "male : married/widowed",
+      "female : single"
+    ),
+    present_residence = c(
+      "< 1 yr",
+      "1 <= ... < 4 yrs",
+      "4 <= ... < 7 yrs",
+      ">= 7 yrs"
+    ),
+    ## "building soc. savings agr./life insurance"
+    ##    was "building society savings agreement/life insurance"
+    property = c(
+      "unknown / no property",
+      "car or other",
+      "building soc. savings agr./life insurance",
+      "real estate"
+    ),
+    other_installment_plans = c("bank", "stores", "none"),
+    housing = c("for free", "rent", "own"),
+    number_credits = c("1", "2-3", "4-5", ">= 6"),
+    ## manager/self-empl./highly qualif. employee  was
+    ##   management/self-employed/highly qualified employee/officer
+    job = c(
+      "unemployed/unskilled - non-resident",
+      "unskilled - resident",
+      "skilled employee/official",
+      "manager/self-empl./highly qualif. employee"
+    ),
+    people_liable = c("3 or more", "0 to 2"),
+    telephone = c("no", "yes (under customer name)"),
+    foreign_worker = c("yes", "no")
+  )
+  for (v in names(labs)) {
+    levels(dat[[v]]) <- labs[[v]]
+  }
+  dat
+})
+
+## checks against fahrmeir table: for each categorical predictor, the
+## cross-tabulation with credit_risk as column percentages (computed
+## within each credit_risk level), rounded to two decimals.
+## tabs is a named list holding one such table per predictor.
+tabs <- lapply(
+  setNames(nm = c(
+    "status", "credit_history", "purpose", "savings",
+    "employment_duration", "installment_rate", "personal_status_sex",
+    "other_debtors", "present_residence", "property",
+    "other_installment_plans", "housing", "number_credits", "job",
+    "people_liable", "telephone", "foreign_worker"
+  )),
+  function(v) {
+    ## counts of <v> by credit_risk, via the formula ~ <v> + credit_risk
+    counts <- xtabs(reformulate(c(v, "credit_risk")), data = dat)
+    ## margin = 2: divide by the column (credit_risk) totals
+    round(100 * prop.table(counts, margin = 2), 2)
+  }
+)
+
+## column indices of the variables that have an entry in tabs
+## (columns 1..20 without 2, 5 and 13)
+tabwhich <- setdiff(seq_len(20), c(2, 5, 13))
+
+print(tabs)