Added dataset and available code
This commit is contained in:
parent
f5809d3b42
commit
87fdc8ad53
1001
dataset/SouthGermanCredit.asc
Normal file
1001
dataset/SouthGermanCredit.asc
Normal file
File diff suppressed because it is too large
Load Diff
133
dataset/codetable.txt
Normal file
133
dataset/codetable.txt
Normal file
@ -0,0 +1,133 @@
|
|||||||
|
$`laufkont = status`
|
||||||
|
|
||||||
|
1 : no checking account
|
||||||
|
2 : ... < 0 DM
|
||||||
|
3 : 0<= ... < 200 DM
|
||||||
|
4 : ... >= 200 DM / salary for at least 1 year
|
||||||
|
|
||||||
|
$`laufzeit = duration`
|
||||||
|
|
||||||
|
|
||||||
|
$`moral = credit_history`
|
||||||
|
|
||||||
|
0 : delay in paying off in the past
|
||||||
|
1 : critical account/other credits elsewhere
|
||||||
|
2 : no credits taken/all credits paid back duly
|
||||||
|
3 : existing credits paid back duly till now
|
||||||
|
4 : all credits at this bank paid back duly
|
||||||
|
|
||||||
|
$`verw = purpose`
|
||||||
|
|
||||||
|
0 : others
|
||||||
|
1 : car (new)
|
||||||
|
2 : car (used)
|
||||||
|
3 : furniture/equipment
|
||||||
|
4 : radio/television
|
||||||
|
5 : domestic appliances
|
||||||
|
6 : repairs
|
||||||
|
7 : education
|
||||||
|
8 : vacation
|
||||||
|
9 : retraining
|
||||||
|
10 : business
|
||||||
|
|
||||||
|
$`hoehe = amount`
|
||||||
|
|
||||||
|
|
||||||
|
$`sparkont = savings`
|
||||||
|
|
||||||
|
1 : unknown/no savings account
|
||||||
|
2 : ... < 100 DM
|
||||||
|
3 : 100 <= ... < 500 DM
|
||||||
|
4 : 500 <= ... < 1000 DM
|
||||||
|
5 : ... >= 1000 DM
|
||||||
|
|
||||||
|
$`beszeit = employment_duration`
|
||||||
|
|
||||||
|
1 : unemployed
|
||||||
|
2 : < 1 yr
|
||||||
|
3 : 1 <= ... < 4 yrs
|
||||||
|
4 : 4 <= ... < 7 yrs
|
||||||
|
5 : >= 7 yrs
|
||||||
|
|
||||||
|
$`rate = installment_rate`
|
||||||
|
|
||||||
|
1 : >= 35
|
||||||
|
2 : 25 <= ... < 35
|
||||||
|
3 : 20 <= ... < 25
|
||||||
|
4 : < 20
|
||||||
|
|
||||||
|
$`famges = personal_status_sex`
|
||||||
|
|
||||||
|
1 : male : divorced/separated
|
||||||
|
2 : female : non-single or male : single
|
||||||
|
3 : male : married/widowed
|
||||||
|
4 : female : single
|
||||||
|
|
||||||
|
$`buerge = other_debtors`
|
||||||
|
|
||||||
|
1 : none
|
||||||
|
2 : co-applicant
|
||||||
|
3 : guarantor
|
||||||
|
|
||||||
|
$`wohnzeit = present_residence`
|
||||||
|
|
||||||
|
1 : < 1 yr
|
||||||
|
2 : 1 <= ... < 4 yrs
|
||||||
|
3 : 4 <= ... < 7 yrs
|
||||||
|
4 : >= 7 yrs
|
||||||
|
|
||||||
|
$`verm = property`
|
||||||
|
|
||||||
|
1 : unknown / no property
|
||||||
|
2 : car or other
|
||||||
|
3 : building soc. savings agr./life insurance
|
||||||
|
4 : real estate
|
||||||
|
|
||||||
|
$`alter = age`
|
||||||
|
|
||||||
|
|
||||||
|
$`weitkred = other_installment_plans`
|
||||||
|
|
||||||
|
1 : bank
|
||||||
|
2 : stores
|
||||||
|
3 : none
|
||||||
|
|
||||||
|
$`wohn = housing`
|
||||||
|
|
||||||
|
1 : for free
|
||||||
|
2 : rent
|
||||||
|
3 : own
|
||||||
|
|
||||||
|
$`bishkred = number_credits`
|
||||||
|
|
||||||
|
1 : 1
|
||||||
|
2 : 2-3
|
||||||
|
3 : 4-5
|
||||||
|
4 : >= 6
|
||||||
|
|
||||||
|
$`beruf = job`
|
||||||
|
|
||||||
|
1 : unemployed/unskilled - non-resident
|
||||||
|
2 : unskilled - resident
|
||||||
|
3 : skilled employee/official
|
||||||
|
4 : manager/self-empl./highly qualif. employee
|
||||||
|
|
||||||
|
$`pers = people_liable`
|
||||||
|
|
||||||
|
1 : 3 or more
|
||||||
|
2 : 0 to 2
|
||||||
|
|
||||||
|
$`telef = telephone`
|
||||||
|
|
||||||
|
1 : no
|
||||||
|
2 : yes (under customer name)
|
||||||
|
|
||||||
|
$`gastarb = foreign_worker`
|
||||||
|
|
||||||
|
1 : yes
|
||||||
|
2 : no
|
||||||
|
|
||||||
|
$`kredit = credit_risk`
|
||||||
|
|
||||||
|
0 : bad
|
||||||
|
1 : good
|
170
project/read_SouthGermanCredit.R
Normal file
170
project/read_SouthGermanCredit.R
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
setwd("/config/workspace/assistenz-r/dataset")
|
||||||
|
dat <- read.table("SouthGermanCredit.asc", header=TRUE)
|
||||||
|
|
||||||
|
## dat contains numbers for all variables.
|
||||||
|
|
||||||
|
## variables duration, amount and age are truly quantitative
|
||||||
|
## variables installment_rate, present_residence and number_credits are
|
||||||
|
### quantitative in the data, but are in fact discretized scores for
|
||||||
|
### an underlying quantitative variable
|
||||||
|
### and are thus stored as ordered factors below
|
||||||
|
## variable people_liable is quantitative in the data but is in fact
|
||||||
|
### a binarized score (less 0 to 2 versus 3 or more)
|
||||||
|
### and is thus stored as a factor below
|
||||||
|
## all the numeric values (=level codes)
|
||||||
|
### for the categorical variables
|
||||||
|
### (including the discretized quantitative variables),
|
||||||
|
### are the P2 scores from Häußler (1979)
|
||||||
|
### which can be directly used in credit scoring (larger=better).
|
||||||
|
### (Exceptions have been corrected in the raw data,
|
||||||
|
### which implies that columns pers and gastarb have
|
||||||
|
### entries opposite to those in Open Data LMU (2010)
|
||||||
|
### and the GermanCredit data from the UCI ML Repo.)
|
||||||
|
|
||||||
|
## variable names from Fahrmeir/Hamerle book
|
||||||
|
nam_fahrmeirbook <- colnames(dat)
|
||||||
|
|
||||||
|
### assign levels
|
||||||
|
### level assignment can be sanity-checked
|
||||||
|
### with Table 2.1 from the Fahrmeir/Hamerle book,
|
||||||
|
### which gives proportions separated for good and bad credit risks.
|
||||||
|
### That table is provided with by Open Data LMU
|
||||||
|
### (https://doi.org/10.5282/ubm/data.23)
|
||||||
|
### together with a German language version of the data set.
|
||||||
|
### A corresponding table for the English language data is produced
|
||||||
|
### below for the final data (levels ordered by increasing code).
|
||||||
|
### Level labels have been taken from package evtree, except for
|
||||||
|
### the variable telephone (where the yes level has been made more detailed)
|
||||||
|
### and those variables that were quantitative and do not have level labels
|
||||||
|
### in evtree.
|
||||||
|
|
||||||
|
nam_evtree <- c("status", "duration", "credit_history", "purpose", "amount",
|
||||||
|
"savings", "employment_duration", "installment_rate",
|
||||||
|
"personal_status_sex", "other_debtors",
|
||||||
|
"present_residence", "property",
|
||||||
|
"age", "other_installment_plans",
|
||||||
|
"housing", "number_credits",
|
||||||
|
"job", "people_liable", "telephone", "foreign_worker",
|
||||||
|
"credit_risk")
|
||||||
|
names(dat) <- nam_evtree
|
||||||
|
|
||||||
|
## make factors for all except the numeric variables
|
||||||
|
## make sure that even empty level of factor purpose = verw (dat[[4]]) is included
|
||||||
|
for (i in setdiff(1:21, c(2,4,5,13)))
|
||||||
|
dat[[i]] <- factor(dat[[i]])
|
||||||
|
## factor purpose
|
||||||
|
dat[[4]] <- factor(dat[[4]], levels=as.character(0:10))
|
||||||
|
|
||||||
|
## assign level codes
|
||||||
|
## make intrinsically ordered factors into class ordered
|
||||||
|
levels(dat$credit_risk) <- c("bad", "good")
|
||||||
|
levels(dat$status) = c("no checking account",
|
||||||
|
"... < 0 DM",
|
||||||
|
"0<= ... < 200 DM",
|
||||||
|
"... >= 200 DM / salary for at least 1 year")
|
||||||
|
## "critical account/other credits elsewhere" was
|
||||||
|
## "critical account/other credits existing (not at this bank)",
|
||||||
|
levels(dat$credit_history) <- c(
|
||||||
|
"delay in paying off in the past",
|
||||||
|
"critical account/other credits elsewhere",
|
||||||
|
"no credits taken/all credits paid back duly",
|
||||||
|
"existing credits paid back duly till now",
|
||||||
|
"all credits at this bank paid back duly")
|
||||||
|
levels(dat$purpose) <- c(
|
||||||
|
"others",
|
||||||
|
"car (new)",
|
||||||
|
"car (used)",
|
||||||
|
"furniture/equipment",
|
||||||
|
"radio/television",
|
||||||
|
"domestic appliances",
|
||||||
|
"repairs",
|
||||||
|
"education",
|
||||||
|
"vacation",
|
||||||
|
"retraining",
|
||||||
|
"business")
|
||||||
|
levels(dat$savings) <- c("unknown/no savings account",
|
||||||
|
"... < 100 DM",
|
||||||
|
"100 <= ... < 500 DM",
|
||||||
|
"500 <= ... < 1000 DM",
|
||||||
|
"... >= 1000 DM")
|
||||||
|
levels(dat$employment_duration) <-
|
||||||
|
c( "unemployed",
|
||||||
|
"< 1 yr",
|
||||||
|
"1 <= ... < 4 yrs",
|
||||||
|
"4 <= ... < 7 yrs",
|
||||||
|
">= 7 yrs")
|
||||||
|
dat$installment_rate <- ordered(dat$installment_rate)
|
||||||
|
levels(dat$installment_rate) <- c(">= 35",
|
||||||
|
"25 <= ... < 35",
|
||||||
|
"20 <= ... < 25",
|
||||||
|
"< 20")
|
||||||
|
levels(dat$other_debtors) <- c(
|
||||||
|
"none",
|
||||||
|
"co-applicant",
|
||||||
|
"guarantor"
|
||||||
|
)
|
||||||
|
## female : nonsingle was female : divorced/separated/married
|
||||||
|
## widowed females are not mentioned in the code table
|
||||||
|
levels(dat$personal_status_sex) <- c(
|
||||||
|
"male : divorced/separated",
|
||||||
|
"female : non-single or male : single",
|
||||||
|
"male : married/widowed",
|
||||||
|
"female : single")
|
||||||
|
dat$present_residence <- ordered(dat$present_residence)
|
||||||
|
levels(dat$present_residence) <- c("< 1 yr",
|
||||||
|
"1 <= ... < 4 yrs",
|
||||||
|
"4 <= ... < 7 yrs",
|
||||||
|
">= 7 yrs")
|
||||||
|
## "building soc. savings agr./life insurance",
|
||||||
|
## was "building society savings agreement/life insurance"
|
||||||
|
levels(dat$property) <- c(
|
||||||
|
"unknown / no property",
|
||||||
|
"car or other",
|
||||||
|
"building soc. savings agr./life insurance",
|
||||||
|
"real estate"
|
||||||
|
)
|
||||||
|
levels(dat$other_installment_plans) <- c(
|
||||||
|
"bank",
|
||||||
|
"stores",
|
||||||
|
"none"
|
||||||
|
)
|
||||||
|
levels(dat$housing) <- c("for free", "rent", "own")
|
||||||
|
dat$number_credits <- ordered(dat$number_credits)
|
||||||
|
levels(dat$number_credits) <- c("1", "2-3", "4-5", ">= 6")
|
||||||
|
## manager/self-empl./highly qualif. employee was
|
||||||
|
## management/self-employed/highly qualified employee/officer
|
||||||
|
levels(dat$job) <- c(
|
||||||
|
"unemployed/unskilled - non-resident",
|
||||||
|
"unskilled - resident",
|
||||||
|
"skilled employee/official",
|
||||||
|
"manager/self-empl./highly qualif. employee"
|
||||||
|
)
|
||||||
|
levels(dat$people_liable) <- c("3 or more", "0 to 2")
|
||||||
|
levels(dat$telephone) <- c("no", "yes (under customer name)")
|
||||||
|
levels(dat$foreign_worker) <- c("yes", "no")
|
||||||
|
|
||||||
|
## checks against fahrmeir table
|
||||||
|
tabs <-
|
||||||
|
list(status = round(100*prop.table(xtabs(~status+credit_risk, dat),2),2),
|
||||||
|
credit_history = round(100*prop.table(xtabs(~credit_history+credit_risk, dat),2),2),
|
||||||
|
purpose = round(100*prop.table(xtabs(~purpose+credit_risk, dat),2),2),
|
||||||
|
savings = round(100*prop.table(xtabs(~savings+credit_risk, dat),2),2),
|
||||||
|
employment_duration = round(100*prop.table(xtabs(~employment_duration+credit_risk, dat),2),2),
|
||||||
|
installment_rate = round(100*prop.table(xtabs(~installment_rate+credit_risk, dat),2),2),
|
||||||
|
personal_status_sex = round(100*prop.table(xtabs(~personal_status_sex+credit_risk, dat),2),2),
|
||||||
|
other_debtors = round(100*prop.table(xtabs(~other_debtors+credit_risk, dat),2),2),
|
||||||
|
present_residence = round(100*prop.table(xtabs(~present_residence+credit_risk, dat),2),2),
|
||||||
|
property = round(100*prop.table(xtabs(~property+credit_risk, dat),2),2),
|
||||||
|
other_installment_plans = round(100*prop.table(xtabs(~other_installment_plans+credit_risk, dat),2),2),
|
||||||
|
housing = round(100*prop.table(xtabs(~housing+credit_risk, dat),2),2),
|
||||||
|
number_credits = round(100*prop.table(xtabs(~number_credits+credit_risk, dat),2),2),
|
||||||
|
job = round(100*prop.table(xtabs(~job+credit_risk, dat),2),2),
|
||||||
|
people_liable = round(100*prop.table(xtabs(~people_liable+credit_risk, dat),2),2),
|
||||||
|
telephone = round(100*prop.table(xtabs(~telephone+credit_risk, dat),2),2),
|
||||||
|
foreign_worker = round(100*prop.table(xtabs(~foreign_worker+credit_risk, dat),2),2))
|
||||||
|
|
||||||
|
## variables for which a tab entry is available
|
||||||
|
## (all except 2, 5 and 13)
|
||||||
|
tabwhich <- setdiff(1:20, c(2,5,13))
|
||||||
|
|
||||||
|
print(tabs)
|
Loading…
Reference in New Issue
Block a user