packages <- c("tidyverse","stargazer","AER","asbio","tigerstats","readxl","foreign","wooldridge","moderndive","gridExtra","haven","car") ## This is how you define an object (here, a character vector of package names)
install.packages(packages, repos='http://cran.us.r-project.org') # Install all the packages at once
lapply(packages, library, character.only = TRUE) # Load the packages

Reading data files into R

Below are some ways to load data files in different formats. To load Excel and Stata data files, you will need the packages “readxl” and “haven”, respectively.

## Read an Excel file

bwsmoking_excel <- read_xlsx("S:\\Baruch\\ECO 4000\\Spring2022\\Datasets\\Birthweight and Smoking\\birthweight_smoking.xlsx")

## Read a csv file

bwsmoking_csv <- read_csv("S:\\Baruch\\ECO 4000\\Spring2022\\Datasets\\Birthweight and Smoking\\bwsmoking.csv")

## Read a STATA file (will require package named "haven")

bwsmoking_stata <- read_dta("S:\\Baruch\\ECO 4000\\Spring2022\\Datasets\\Birthweight and Smoking\\birthweight_smoking.dta")

## Read an RDS file 

bwsmoking_R <- readRDS("S:\\Baruch\\ECO 4000\\Spring2022\\Datasets\\Birthweight and Smoking\\bwsmoking.rds")

One Sample Hypothesis Testing

These examples are from the practice problem set provided to the students.

Problem 9

A company claims that its soup machines deliver exactly 10.0 ounces of soup—no more, no less. A researcher samples 100 bowls of soup and finds that:

\(\overline{X} = 10.28\) ounces
\(s = 1.20\) ounces
Test the company’s claim at the 5% and 1% significance levels.

We denote null hypothesis as \(H_{0}\) and alternative hypothesis as \(H_{1}\).

\[H_{0} :\mu = 10\] \[H_{1} :\mu \neq 10\]

We define the z-statistic in terms of the sample mean, the sample size, and the population standard deviation \(\sigma\). (Here \(\sigma\) is unknown, so we use the sample standard deviation s in its place; with n = 100 this is a good approximation.)

\(z = \frac{\overline{X} - \mu_{0}}{\bigg(\frac{\sigma}{\sqrt{n}}\bigg)}\)

We reject the null hypothesis if \(|z| \geq |z_{\frac{\alpha}{2}}|\), where \(|z|\) is the absolute value of the z-statistic we calculated and \(|z_{\frac{\alpha}{2}}|\) is the critical value we obtain from the table. Notice that we use \(|z_{\frac{\alpha}{2}}|\) here because we are conducting a two-sided test.

Equivalently, we reject the null hypothesis if the p-value \(\leq \alpha\), where \(\alpha\) is the significance level and the two-sided p-value is \(2P(Z \geq |z|)\). (We will often use 0.01, 0.05, and 0.1 as values for \(\alpha\).)
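
As a quick illustration of both decision rules in R, here is a minimal sketch using the numbers from Problem 9 (values in the comments are rounded):

z <- (10.28 - 10)/(1.20/sqrt(100)) # z-statistic, = 2.33
z_crit <- qnorm(1 - 0.05/2)        # two-sided critical value at alpha = 0.05, ~1.96
p_val <- 2 * (1 - pnorm(abs(z)))   # two-sided p-value, ~0.0196
abs(z) >= z_crit                   # TRUE, so we reject at the 5% level
p_val <= 0.05                      # TRUE, same conclusion

Since the p-value (~0.0196) lies between 0.01 and 0.05, we should expect to reject at the 5% level but not at the 1% level, which is exactly what the tests below show.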

# A function to calculate the z-statistic and compare it with the critical value of z (the one we obtain from the table)
z_test <- function(mu, x_bar, s, n, alpha, alternative){
  se <- s/sqrt(n)          # standard error of the sample mean
  z <- (x_bar - mu)/se
  if(alternative == "two-sided"){
    z_alpha = qnorm(1 - alpha/2)
  }
  else if(alternative == "less"){
    z_alpha = -qnorm(1 - alpha)
  }
  else if(alternative == "greater"){
    z_alpha = qnorm(1 - alpha)
  }
  else{
    stop("alternative must be 'two-sided', 'less', or 'greater'")
  }
  
  z_list = list("|z_calculated|" = abs(z), "|z_critical|" = abs(z_alpha))
  return(z_list)
}
set.seed(21)
Q_9 <- tibble(X = rnorm(mean = 10.28, sd = 0.12, n = 100)) # Simulated data (note: drawn with sd = 0.12, not the problem's s = 1.20)
z_05 <- z_test(10,10.28,s = 1.2,100,0.05, alternative = "two-sided") # a two-sided test at the 5% significance level
z_05
## $`|z_calculated|`
## [1] 2.333333
## 
## $`|z_critical|`
## [1] 1.959964
t.test(x = Q_9$X, mu = 10, alternative = "two.sided")
## 
##  One Sample t-test
## 
## data:  Q_9$X
## t = 23.422, df = 99, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 10
## 95 percent confidence interval:
##  10.26424 10.31316
## sample estimates:
## mean of x 
##   10.2887

As we notice here, the absolute value of z-calculated = 2.33 is greater than the absolute value of z-critical = 1.96. Therefore, we reject the null hypothesis that \(\mu = 10\) at the 5% level. (The much larger t statistic in the t.test output reflects the simulated data, which were drawn with a standard deviation of 0.12 rather than the problem's s = 1.20; the z_test call uses the problem's numbers directly.)

z_01 <- z_test(10,10.28,s = 1.2,100,0.01, alternative = "two-sided") # a two-sided test at the 1% significance level
z_01
## $`|z_calculated|`
## [1] 2.333333
## 
## $`|z_critical|`
## [1] 2.575829

As we notice above, the absolute value of z-calculated = 2.33 is less than the absolute value of z-critical = 2.58. Therefore, we fail to reject the null hypothesis that \(\mu = 10\) at the 1% level.

Problem 6

A manufacturer produces drill bits with an intended life of at least 580 hours and a standard deviation of 30 hours. A quality control scientist draws a sample of 100 bits and finds \(\overline{X}=577\). Test at \(\alpha =.05\) to see if the machinery needs adjusting.

\[H_{0} :\mu \geq 580\] \[H_{1} :\mu < 580\]

set.seed(121)
Q_6 <- tibble(X = rnorm(mean = 577, sd = 30, n = 100)) # Simulated data from N(577, 900); the population need not be normal (recall the Central Limit Theorem!)
z_one_sided <- z_test(mu = 580, x_bar = 577, s = 30, n = 100, alpha = 0.05, alternative = "less")
z_one_sided
## $`|z_calculated|`
## [1] 1
## 
## $`|z_critical|`
## [1] 1.644854

As we can see above, the absolute value of z-calculated = 1 is less than the absolute value of z-critical = 1.64. Therefore, we fail to reject the null hypothesis that \(\mu \geq 580\): the machinery does not need adjusting.
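
Equivalently, in terms of the one-sided p-value (a minimal check; the value in the comment is rounded):

p_val <- pnorm(-1) # left-tail p-value for z = -1, ~0.159
p_val <= 0.05      # FALSE, so we fail to reject at the 5% level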

Problem 4

A drug company that manufactures a diet drug claims that those using the drug for 30 days will lose at least 15 pounds. You sample 30 people who have used the drug and find that the average weight loss was 12 pounds with a standard deviation of 5 pounds. (Hint: when the sample is small, i.e. \(n \leq 30\), and the population standard deviation is unknown, use a t-test.) Test the claim at the .05 significance level.

\[H_{0} :\mu \geq 15\] \[H_{1} :\mu < 15\]

t_test <- function(mu, x_bar, s, n, alpha, alternative){
  se <- s/sqrt(n)          # standard error of the sample mean
  t <- (x_bar - mu)/se
  if(alternative == "two-sided"){
    t_alpha = qt(1 - alpha/2, df = n-1)
  }
  else if(alternative == "less"){
    t_alpha = -qt(1 - alpha, df = n-1)
  }
  else if(alternative == "greater"){
    t_alpha = qt(1 - alpha, df = n-1)
  }
  else{
    stop("alternative must be 'two-sided', 'less', or 'greater'")
  }
  
  t_list = list("|t_calculated|" = abs(t), "|t_critical|" = abs(t_alpha))
  return(t_list)
}


t_one_sided <- t_test(15,12,s = 5,n=30,0.05,alternative = "less")

t_one_sided
## $`|t_calculated|`
## [1] 3.286335
## 
## $`|t_critical|`
## [1] 1.699127
z_one_sided <- z_test(15,12,s = 5,n=30,0.05,alternative = "less") # for comparison: with small n the z critical value (1.64) understates the t critical value (1.70)
z_one_sided
## $`|z_calculated|`
## [1] 3.286335
## 
## $`|z_critical|`
## [1] 1.644854
pvalt <- pt(-t_one_sided$`|t_calculated|`,29) # one-sided (left-tail) p-value; no doubling, since this is a one-sided test
pvalt
## [1] 0.001329499

As we notice above, the absolute value of t-calculated = 3.29 is greater than the absolute value of t-critical = 1.70. Therefore, we reject the null hypothesis that \(\mu \geq 15\).

The p-value here is about 0.0013, which is less than 0.05. Therefore, we reject the null hypothesis that \(\mu \geq 15\).

Monte Carlo Simulation to Confirm the Unbiasedness of the Slope Parameter

## POPULATION PARAMETERS
B0 = 2    # true intercept
B1 = 0.5  # true slope

n = 10000 # number of Monte Carlo replications

coeffs <- tibble(b_0 = rep(0,n),b_1 = rep(0,n)) # storage for the estimates

for (i in 1:n) {
  
  dat1 <- tibble(X = 1:50, u = rnorm(50), Y = B0 + B1*X + u) # draw a fresh sample from the true model in each replication
  reg <- lm(Y~X, data = dat1)
  model <- summary(reg)
  
  coeffs$b_0[i] = model$coefficients[1,1]
  coeffs$b_1[i] = model$coefficients[2,1]
  
}

### With increased error (the error standard deviation is doubled)

coeffs_2 <- tibble(b_02 = rep(0,n),b_12 = rep(0,n))

for (i in 1:n) {
  
  dat2 <- tibble(X = 1:50, u = rnorm(50), Y = B0 + B1*X + 2*u)
  reg2 <- lm(Y~X, data = dat2)
  model2 <- summary(reg2)
  
  coeffs_2$b_02[i] = model2$coefficients[1,1]
  coeffs_2$b_12[i] = model2$coefficients[2,1]
  
}


##### Plot the two sampling distributions of the slope estimate

ggplot(data = coeffs, aes(x = b_1))+
  geom_density(color = "blue")+
  geom_density(data = coeffs_2, aes(x = b_12), color = "green")+
  xlab("beta_hat")

Both densities are centered at the true value B1 = 0.5, which is what unbiasedness means; the noisier design (green) produces a wider distribution, i.e., a larger variance of the estimator, but no bias.
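
A quick numerical check (a sketch; with 10,000 replications the averages should be very close to the true values, though the exact digits depend on the random draws):

colMeans(coeffs)   # average estimates of b_0 and b_1, close to 2 and 0.5
colMeans(coeffs_2) # with the noisier errors the averages are still close to 2 and 0.5
sd(coeffs$b_1); sd(coeffs_2$b_12) # but the spread of b_1 is larger in the noisier design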

Simple Linear Regression : Results Interpretation

For this exercise we need the R package “wooldridge”. This package contains all the data sets used in Wooldridge’s Introductory Econometrics textbook. We will use those data sets for our purposes.

E1. For the population of chief executive officers, let Y be annual salary (salary) in thousands of dollars. Thus, y = 856.3 indicates an annual salary of $856,300, and y = 1,452.6 indicates a salary of $1,452,600. Let X be the average return on equity (roe) for the CEO’s firm for the previous three years. (Return on equity is defined in terms of net income as a percentage of common equity.) For example, if roe = 10, then average return on equity is 10%.

We postulate that our model is

\[salary = \beta_{0} + \beta_{1} roe + u\]

The slope parameter \(\beta_{1}\) measures the change in annual salary when the return on equity increases by one percentage point. Since a higher roe is good for the company, we expect \(\beta_{1}\) to be positive.

load("S:\\Baruch\\ECO 4000\\Spring2022\\Datasets\\ceosal1.RData")
ceo1 <- data
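
Here we load a local copy of the data set. Since the wooldridge package is already loaded, the same data frame is also available directly from the package, an equivalent alternative:

ceo1 <- wooldridge::ceosal1 # same data set, taken from the package instead of a local file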

## Summary Statistics

stargazer(ceo1, type = "html",digits = 2, title = "Descriptive Statistics")
Descriptive Statistics
Statistic N Mean St. Dev. Min Max
salary 209 1,281.12 1,372.35 223 14,822
pcsalary 209 13.28 32.63 -61 212
sales 209 6,923.79 10,633.27 175.20 97,649.90
roe 209 17.18 8.52 0.50 56.30
pcroe 209 10.80 97.22 -98.90 977.00
ros 209 61.80 68.18 -58 418
indus 209 0.32 0.47 0 1
finance 209 0.22 0.42 0 1
consprod 209 0.29 0.45 0 1
utility 209 0.17 0.38 0 1
lsalary 209 6.95 0.57 5.41 9.60
lsales 209 8.29 1.01 5.17 11.49

Always start by looking at the summary statistics of your data; it helps you get a feel for the data before the analysis begins. The data set CEOSAL1 contains information on 209 CEOs for the year 1990; these data were obtained from Business Week (5/6/91). In this sample, the average annual salary is $1,281,120, with the smallest and largest being $223,000 and $14,822,000, respectively. The average return on equity for the years 1988, 1989, and 1990 is 17.18%, with the smallest and largest values being 0.5% and 56.3%, respectively.

## plot the data 
p1 <- ggplot(data = ceo1, aes(x = roe, y = salary)) +
  geom_point()+
  geom_smooth(method = "lm")+
  xlab("Return on Equity")+
  ylab("Salary")

p2 <- ggplot(data = ceo1, aes(x = roe, y = lsalary)) +  ## Just for your reference
  geom_point()+
  geom_smooth(method = "lm")+
  xlab("Return on Equity")+
  ylab("Log Salary")

gridExtra::grid.arrange(p1,p2)

We have the results after fitting the model. Our fitted model looks like this:

\[\widehat{salary} = 963.191 + 18.501 roe\]

model_1 <- lm(data = ceo1, salary ~ roe) # level - level
model_2 <- lm(data = ceo1, lsalary ~ roe) # log-level
stargazer(model_1,model_2, type = "html",dep.var.labels = c("Salary","Log Salary"), title = "CEO Salary and Return on Equity", style = "qje",notes.append = FALSE,notes = c("<sup>&sstarf;</sup>p<0.1; <sup>&sstarf;&sstarf;</sup>p<0.05; <sup>&sstarf;&sstarf;&sstarf;</sup>p<0.01"))
CEO Salary and Return on Equity
Salary Log Salary
(1) (2)
roe 18.501* 0.014***
(11.123) (0.005)
Constant 963.191*** 6.712***
(213.240) (0.087)
N 209 209
R2 0.013 0.043
Adjusted R2 0.008 0.039
Residual Std. Error (df = 207) 1,366.555 0.555
F Statistic (df = 1; 207) 2.767* 9.408***
Notes: ⋆p<0.1; ⋆⋆p<0.05; ⋆⋆⋆p<0.01

where the intercept and slope estimates have been rounded to three decimal places; we use “salary hat” to indicate that this is an estimated equation.

How do we interpret the equation?

First, if the return on equity is zero, roe = 0, then the predicted salary is the intercept, 963.191, which equals $963,191 because salary is measured in thousands. Next, we can write the predicted change in salary as a function of the change in roe: \(\widehat{\Delta salary} = 18.501 (\Delta roe)\). This means that if the return on equity increases by one percentage point, \((\Delta roe) = 1\) , then salary is predicted to change by about 18.5, or $18,500.

Because it is a linear equation, this is the estimated change regardless of the initial salary.

Question : What is the predicted salary of a CEO if the return on equity is 30 percent?

We can use our fitted model, \(\widehat{salary} = 963.191 + 18.501 roe\), and plug in the value roe = 30: \(\widehat{salary} = 963.191 + 18.501 (30) = 1,518.221\) thousand, i.e., about $1,518,221, which is over $1.5 million. This is the predicted value that our model gives us. Now, keep in mind, as we discussed in class, that the zero conditional mean assumption is unlikely to hold here: there are other variables that could potentially affect the salary of a CEO.
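
The same prediction can be obtained with predict() on the fitted model_1 (a minimal sketch):

predict(model_1, newdata = data.frame(roe = 30)) # ~1518.22, in thousands of dollars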

ceo1 <- ceo1%>%
  mutate(lroe = log(roe))
model_3 <- lm(data = ceo1, salary ~ lroe) # level-log
model_4 <- lm(data = ceo1, lsalary ~ roe) # log-level (identical to model_2 above; repeated so all three forms appear in one table)
model_5 <- lm(data = ceo1, lsalary ~ lroe) # log-log
stargazer(model_3,model_4, model_5,type = "html",dep.var.labels = c("Salary","Log Salary","Log Salary"), title = "CEO Salary and Return on Equity", style = "qje",notes.append = FALSE,notes = c("<sup>&sstarf;</sup>p<0.1; <sup>&sstarf;&sstarf;</sup>p<0.05; <sup>&sstarf;&sstarf;&sstarf;</sup>p<0.01"))
CEO Salary and Return on Equity
Salary Log Salary Log Salary
(1) (2) (3)
lroe 255.311 0.170**
(173.884) (0.071)
roe 0.014***
(0.005)
Constant 586.596 6.712*** 6.489***
(482.396) (0.087) (0.197)
N 209 209 209
R2 0.010 0.043 0.027
Adjusted R2 0.006 0.039 0.022
Residual Std. Error (df = 207) 1,368.548 0.555 0.560
F Statistic (df = 1; 207) 2.156 9.408*** 5.689**
Notes: ⋆p<0.1; ⋆⋆p<0.05; ⋆⋆⋆p<0.01

Simple Linear Regression with Logarithms : Results Interpretation

Analysis of Wage and Education Data (1976 CPS)

## Summary Statistics

stargazer(wage1, type = "html",digits = 2, title = "Descriptive Statistics") # wage1 is lazy-loaded from the wooldridge package
Descriptive Statistics
Statistic N Mean St. Dev. Min Max
wage 526 5.90 3.69 0.53 24.98
educ 526 12.56 2.77 0 18
exper 526 17.02 13.57 1 51
tenure 526 5.10 7.22 0 44
nonwhite 526 0.10 0.30 0 1
female 526 0.48 0.50 0 1
married 526 0.61 0.49 0 1
numdep 526 1.04 1.26 0 6
smsa 526 0.72 0.45 0 1
northcen 526 0.25 0.43 0 1
south 526 0.36 0.48 0 1
west 526 0.17 0.38 0 1
construc 526 0.05 0.21 0 1
ndurman 526 0.11 0.32 0 1
trcommpu 526 0.04 0.20 0 1
trade 526 0.29 0.45 0 1
services 526 0.10 0.30 0 1
profserv 526 0.26 0.44 0 1
profocc 526 0.37 0.48 0 1
clerocc 526 0.17 0.37 0 1
servocc 526 0.14 0.35 0 1
lwage 526 1.62 0.53 -0.63 3.22
expersq 526 473.44 616.04 1 2,601
tenursq 526 78.15 199.43 0 1,936

### Wage-Education
w1 <- ggplot(data = wage1, aes(x = educ, y = wage)) +
  geom_point()+
  geom_smooth(method = "lm")+
  xlab("Years of Education")+
  ylab("Hourly Wage")+
  labs(title = "Wages vs Education, 1976")

w2 <- ggplot(data = wage1, aes(x = educ, y = lwage)) +
  geom_point()+
  geom_smooth(method = "lm")+
  xlab("Years of Education")+
  ylab("Hourly Wage(in log terms)")+
  labs(title = "Wages vs Education, 1976")

gridExtra::grid.arrange(w1,w2)

## Regression Analysis
wmodel_1 <- lm(data = wage1, wage ~ educ)
wmodel_2 <- lm(data = wage1, lwage ~ educ)

wage1 = wage1%>%
  filter(educ != 0)%>% # log(0) is undefined, so the two observations with educ = 0 are dropped (N falls from 526 to 524)
  mutate(leduc = log(educ))
wmodel_3 <- lm(data = wage1, wage ~ leduc)
wmodel_4 <- lm(data = wage1, lwage ~ leduc)
stargazer(wmodel_1, wmodel_2,wmodel_3,wmodel_4,type = "html",dep.var.labels = c("wage","log wage","wage","log wage"), title = "Wage and Education", style = "qje",notes.append = FALSE,notes = c("<sup>&sstarf;</sup>p<0.1; <sup>&sstarf;&sstarf;</sup>p<0.05; <sup>&sstarf;&sstarf;&sstarf;</sup>p<0.01"))
Wage and Education
wage log wage wage log wage
(1) (2) (3) (4)
educ 0.541*** 0.083***
(0.053) (0.008)
leduc 5.330*** 0.825***
(0.608) (0.086)
Constant -0.905 0.584*** -7.460*** -0.445**
(0.685) (0.097) (1.532) (0.218)
N 526 526 524 524
R2 0.165 0.186 0.128 0.149
Adjusted R2 0.163 0.184 0.127 0.147
Residual Std. Error 3.378 (df = 524) 0.480 (df = 524) 3.455 (df = 522) 0.491 (df = 522)
F Statistic 103.363*** (df = 1; 524) 119.582*** (df = 1; 524) 76.849*** (df = 1; 522) 91.119*** (df = 1; 522)
Notes: ⋆p<0.1; ⋆⋆p<0.05; ⋆⋆⋆p<0.01
  • In regression 1 (level-level), a one-year increase in education is associated with a $0.541 increase in hourly wage.

  • In regression 2 (log-level), a one-year increase in education is associated with a \(100 \times \hat{\beta_{1}}\) = 100 x 0.083 = 8.3% increase in hourly wage.

  • In regression 3 (level-log), a 1% increase in education is associated with a \(\frac{1}{100} \times \hat{\beta_{1}}\) = \(\frac{1}{100} \times 5.33\) = $0.0533 increase in hourly wage.

  • In regression 4 (log-log), a 1% increase in education is associated with a \(\hat{\beta_{1}}\)% = 0.83% increase in hourly wage. (This is an elasticity! Recall from your economics class.)
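
The \(100 \times \hat{\beta_{1}}\) reading of regression 2 is an approximation that is accurate for small changes; the exact percentage change implied by the log-level model can be computed directly (a quick sketch, values in the comments rounded):

b1 <- coef(wmodel_2)["educ"] # ~0.083
100 * (exp(b1) - 1)          # exact % change in wage per extra year of education, ~8.7%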

We cannot compare \(R^{2}\) values when the dependent variables are different. For instance, in the table above we cannot compare regressions 1 and 2, since the dependent variables are wage and log(wage), respectively. In these situations we rely on our knowledge of economic theory and make decisions based on that. For example, a specification like regression 2 is standard practice in labor economics.

Multiple Regression Analysis : Results Interpretation

m1 <- ggplot(data = wage1, aes(x = educ, y = lwage)) +
  geom_point()+
  geom_smooth(method = "lm")+
  xlab("Years of Education")+
  ylab("Hourly Wage")+
  labs(title = "Wages vs Education, 1976")

m2 <- ggplot(data = wage1, aes(x = exper, y = lwage)) +
  geom_point()+
  geom_smooth(method = "lm")+
  xlab("Years of Experience")+
  ylab("Hourly Wage)")+
  labs(title = "Wages vs Experience, 1976")

m3 <- ggplot(data = wage1, aes(x = tenure, y = lwage)) +
  geom_point()+
  geom_smooth(method = "lm")+
  xlab("Years with Employer")+
  ylab("Hourly Wage)")+
  labs(title = "Wages vs Tenure, 1976")

m4 <- ggplot(data = wage1, aes(x = married, y = lwage)) +
  geom_point()+
  geom_smooth(method = "lm")+
  xlab("Marital Status")+
  ylab("Hourly Wage)")+
  labs(title = "Wages vs Marital Status, 1976")

gridExtra::grid.arrange(m1,m2,m3,m4)

## Regression Analysis
multiple_1 <- lm(data = wage1, wage ~ educ)
multiple_2 <- lm(data = wage1, lwage ~ educ)
multiple_3 <- lm(data = wage1, lwage ~ educ + exper)
multiple_4 <- lm(data = wage1, lwage ~ educ + exper + tenure)
multiple_5 <- lm(data = wage1, lwage ~ educ + exper + tenure + married)
stargazer(multiple_1, multiple_2,multiple_3,multiple_4,multiple_5,type = "html",dep.var.labels = c("wage","log wage"), title = "Wage and Education", style = "qje",notes.append = FALSE,notes = c("<sup>&sstarf;</sup>p<0.1; <sup>&sstarf;&sstarf;</sup>p<0.05; <sup>&sstarf;&sstarf;&sstarf;</sup>p<0.01"))
Wage and Education
wage log wage log wage log wage log wage
(1) (2) (3) (4) (5)
educ 0.572*** 0.087*** 0.103*** 0.097*** 0.092***
(0.055) (0.008) (0.008) (0.008) (0.008)
exper 0.010*** 0.004** 0.002
(0.002) (0.002) (0.002)
tenure 0.022*** 0.021***
(0.003) (0.003)
married 0.167***
(0.042)
Constant -1.302* 0.525*** 0.150 0.217** 0.218**
(0.714) (0.101) (0.112) (0.107) (0.106)
N 524 524 524 524 524
R2 0.169 0.191 0.256 0.322 0.342
Adjusted R2 0.168 0.189 0.253 0.318 0.337
Residual Std. Error 3.372 (df = 522) 0.479 (df = 522) 0.460 (df = 521) 0.439 (df = 520) 0.433 (df = 519)
F Statistic 106.506*** (df = 1; 522) 123.032*** (df = 1; 522) 89.545*** (df = 2; 521) 82.375*** (df = 3; 520) 67.558*** (df = 4; 519)
Notes: ⋆p<0.1; ⋆⋆p<0.05; ⋆⋆⋆p<0.01

Reading column (4), for example: holding experience and tenure fixed, one more year of education is associated with roughly a 100 x 0.097 = 9.7% higher hourly wage.

Binary Independent Variables and Interaction Terms

## Regression Analysis
mod_1 <- lm(data = wage1, lwage ~ female)
mod_2 <- lm(data = wage1, lwage ~ female + educ)
mod_3 <- lm(data = wage1, lwage ~ female + educ + exper)
mod_4 <- lm(data = wage1, lwage ~ female + educ + exper + tenure)
mod_5 <- lm(data = wage1, lwage ~ female + educ + exper + tenure + married)
mod_6 <- lm(data = wage1, lwage ~ female + educ + exper + tenure + female*educ)
mod_7 <- lm(data = wage1, lwage ~ female + educ + exper + tenure + female*educ + female*married)
stargazer(mod_1, mod_2,mod_3,mod_4,mod_5,mod_6,mod_7,type = "html",dep.var.labels = c("log wage"), title = "Wage and Gender", style = "qje",notes.append = FALSE,notes = c("<sup>&sstarf;</sup>p<0.1; <sup>&sstarf;&sstarf;</sup>p<0.05; <sup>&sstarf;&sstarf;&sstarf;</sup>p<0.01"))
Wage and Gender
log wage
(1) (2) (3) (4) (5) (6) (7)
female -0.396*** -0.365*** -0.348*** -0.306*** -0.290*** -0.407** -0.252
(0.043) (0.039) (0.037) (0.037) (0.037) (0.186) (0.184)
educ 0.082*** 0.097*** 0.093*** 0.089*** 0.090*** 0.085***
(0.007) (0.007) (0.007) (0.007) (0.009) (0.009)
exper 0.010*** 0.005*** 0.003* 0.005*** 0.003**
(0.001) (0.002) (0.002) (0.002) (0.002)
tenure 0.017*** 0.017*** 0.017*** 0.015***
(0.003) (0.003) (0.003) (0.003)
married 0.125*** 0.291***
(0.040) (0.055)
female:educ 0.008 0.012
(0.014) (0.014)
female:married -0.317***
(0.074)
Constant 1.814*** 0.759*** 0.407*** 0.428*** 0.418*** 0.464*** 0.369***
(0.030) (0.097) (0.107) (0.104) (0.103) (0.122) (0.121)
N 524 524 524 524 524 524 524
R2 0.138 0.308 0.362 0.401 0.412 0.401 0.433
Adjusted R2 0.137 0.305 0.358 0.396 0.406 0.395 0.425
Residual Std. Error 0.494 (df = 522) 0.443 (df = 521) 0.426 (df = 520) 0.413 (df = 519) 0.410 (df = 518) 0.414 (df = 518) 0.403 (df = 516)
F Statistic 83.868*** (df = 1; 522) 115.919*** (df = 2; 521) 98.193*** (df = 3; 520) 86.774*** (df = 4; 519) 72.579*** (df = 5; 518) 69.388*** (df = 5; 518) 56.191*** (df = 7; 516)
Notes: ⋆p<0.1; ⋆⋆p<0.05; ⋆⋆⋆p<0.01

In specification (7), the marriage premium differs by gender: for men, marriage is associated with about a 29% higher wage (the married coefficient, 0.291), while for women the association is 0.291 - 0.317 = -0.026, essentially zero.
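
A minimal sketch of that last calculation from the fitted model:

coef(mod_7)["married"] + coef(mod_7)["female:married"] # marriage effect for women, ~ -0.03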

Parallel Slopes Model

wage_new <- wage1 %>%
  mutate(Gender = if_else(female == 1,"female","male"))
ggplot(data = wage_new, aes(x = educ, y = lwage, color = Gender)) +
  geom_point()+
  geom_parallel_slopes(se = FALSE) # geom_parallel_slopes() comes from the moderndive package

The parallel slopes model fits a common educ slope with separate intercepts for each gender, i.e., the same specification as mod_2 above.

Dummy Variable Trap

wage_new <- wage1%>%
  mutate(male = if_else(female == 1,0,1)) # male = 1 - female
mod_m <- lm(data = wage_new, lwage ~ male)
mod_fm <- lm(data = wage_new, lwage ~ female)
mod_red <- lm(data = wage_new, lwage ~ female + male) # female + male + intercept are perfectly collinear: the dummy variable trap
stargazer(mod_m,mod_fm,mod_red,type = "html",dep.var.labels = c("log wage"), title = "Wage and Gender", style = "qje",notes.append = FALSE,notes = c("<sup>&sstarf;</sup>p<0.1; <sup>&sstarf;&sstarf;</sup>p<0.05; <sup>&sstarf;&sstarf;&sstarf;</sup>p<0.01"))
Wage and Gender
log wage
(1) (2) (3)
male 0.396***
(0.043)
female -0.396*** -0.396***
(0.043) (0.043)
Constant 1.418*** 1.814*** 1.814***
(0.031) (0.030) (0.030)
N 524 524 524
R2 0.138 0.138 0.138
Adjusted R2 0.137 0.137 0.137
Residual Std. Error (df = 522) 0.494 0.494 0.494
F Statistic (df = 1; 522) 83.868*** 83.868*** 83.868***
Notes: ⋆p<0.1; ⋆⋆p<0.05; ⋆⋆⋆p<0.01

In column (3), male = 1 - female is perfectly collinear with the intercept and female, so R drops male (its coefficient is returned as NA) and the results are identical to column (2). This is the dummy variable trap: with an intercept, always include one dummy fewer than the number of categories.
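
We can verify directly that R dropped the redundant dummy (a quick check):

coef(mod_red) # the coefficient on male is NA: R dropped it due to perfect collinearity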

With Multiple Dummies

Let us go back to our old example on salaries of CEOs

  • salary: 1990 salary, thousands $

  • pcsalary: percent change salary, 89-90

  • sales: 1990 firm sales, millions $

  • roe: return on equity, 88-90 avg

  • pcroe: percent change roe, 88-90

  • ros: return on firm’s stock, 88-90

  • indus: =1 if industrial firm

  • finance: =1 if financial firm

  • consprod: =1 if consumer product firm

  • utility: =1 if transport. or utilities

  • lsalary: natural log of salary

  • lsales: natural log of sales

model_md <- lm(data = ceosal1, salary ~ roe + indus + finance + consprod) # utility is the omitted base group
model_md1 <- lm(data = ceosal1, salary ~ roe + indus + finance + consprod + utility) # all four dummies plus an intercept: the dummy variable trap again
stargazer(model_md, model_md1,type = "html",dep.var.labels   = "Salary", title = "CEO Salary and Return on Equity", style = "qje",notes.append = FALSE,notes = c("<sup>&sstarf;</sup>p<0.1; <sup>&sstarf;&sstarf;</sup>p<0.05; <sup>&sstarf;&sstarf;&sstarf;</sup>p<0.01"))
CEO Salary and Return on Equity
Salary
(1) (2)
roe 2.499 2.499
(12.397) (12.397)
indus 396.514 396.514
(286.959) (286.959)
finance 609.637** 609.637**
(300.829) (300.829)
consprod 966.333*** 966.333***
(315.436) (315.436)
utility (dropped: coefficient returned as NA)
Constant 699.470*** 699.470***
(264.622) (264.622)
N 209 209
R2 0.062 0.062
Adjusted R2 0.044 0.044
Residual Std. Error (df = 204) 1,342.054 1,342.054
F Statistic (df = 4; 204) 3.374** 3.374**
Notes: ⋆p<0.1; ⋆⋆p<0.05; ⋆⋆⋆p<0.01

The two columns are identical: in model (2) the four industry dummies sum to one, which is perfectly collinear with the intercept, so R drops utility (its coefficient is NA), leaving exactly the model in column (1) with utility firms as the base group. The dummy coefficients are therefore read relative to utility firms; for example, a consumer products firm's CEO is predicted to earn about $966,333 more than a utility firm's CEO with the same roe.