# Variable selection structure

### Basic Variable selection

In [1]:
import cvxpy as cp
import numpy as np 
from sklearn.datasets import fetch_openml

# Load the Boston dataset (high dimensional) from OpenML as plain NumPy arrays.
boston = fetch_openml(name='boston_corrected', version=1, as_frame=False)
X, y = boston.data, boston.target
print(f"Boston dataset: {X.shape}")

# Drop the first column of the design matrix; the model only sees the rest.
X = X[:, 1:]

# Human-readable labels, indexed like the columns of X after the slice above.
# NOTE(review): presumably this follows the OpenML column order with the first
# column removed -- verify against boston.feature_names[1:].
feature_names = [
  'TOWN_ID',
  'TRACT',
  'LON',
  'LAT',
  'MEDV',
  'CMEDV',
  'CRIM',
  'ZN',
  'INDUS',
  'CHAS',
  'NOX',
  'RM',
  'AGE',
  'DIS',
  'RAD',
  'TAX',
  'PTRATIO',
  'B']

# The solver functions look labels up by column index, so a length mismatch
# would either raise late or silently print the wrong feature name.
assert len(feature_names) == X.shape[1], (
    f"feature_names has {len(feature_names)} entries but X has {X.shape[1]} columns"
)

# Big-M bound: every coefficient is constrained to the interval [-M, M].
# NOTE(review): the recorded output reports RM at -0.8000, i.e. exactly on the
# bound, so M appears to restrict the fit -- confirm that M is large enough.
M = 0.8
# Weight of the squared-L2 (ridge) penalty used by the ridge formulations.
lamda = 100
# Maximum number of features that may receive a non-zero coefficient.
k = 4

def sparse_basic(X,M,k):
    """Solve a cardinality-constrained least-squares problem and print the result.

    The mixed-integer program is

        minimize    sum_squares(y - X @ beta)
        subject to  -M * q <= beta <= M * q
                    sum(q) <= k
                    q binary

    so ``beta[i]`` is forced to zero whenever ``q[i]`` is zero, and at most
    ``k`` coefficients can be non-zero.

    Parameters
    ----------
    X : 2-D array
        Design matrix; the number of columns sets the number of coefficients.
    M : float
        Big-M bound on the absolute value of every coefficient.
    k : int
        Upper limit on the number of selected features.

    Returns
    -------
    None
        The solver status, the indicator vector and the selected coefficients
        are printed.

    Notes
    -----
    Reads the module-level ``y`` (response vector) and ``feature_names``
    (labels indexed like the columns of ``X``). The problem is handed to the
    MOSEK solver.
    """
    p = X.shape[1]
    beta = cp.Variable(p)
    q = cp.Variable(p, boolean=True)

    # Element-wise vector constraints replace one scalar pair per feature.
    constraints = [
        beta >= -M * q,
        beta <= M * q,
        cp.sum(q) <= k,
    ]

    objective = cp.Minimize(cp.sum_squares(y - X @ beta))

    problem = cp.Problem(objective, constraints)
    problem.solve(solver=cp.MOSEK, verbose=False)

    print(f"Status: {problem.status}")

    # Variable values stay None when no solution was found (for example an
    # infeasible problem); comparing None against 0.5 would raise TypeError.
    if q.value is None or beta.value is None:
        print("No solution available; nothing to report.")
        return

    # Threshold at 0.5 because the solver returns the binaries as floats.
    selected_indices = np.where(q.value > 0.5)[0]
    print(f"Selected features: {selected_indices}")
    print(f"Binary variables: {q.value}")

    print("\n=== SELECTED FEATURES ===")
    print(f"Selected {len(selected_indices)} out of {p} features:")
    print("-" * 50)

    for idx in selected_indices:
        feature_name = feature_names[idx]
        coefficient = beta.value[idx]
        print(f"  {feature_name:<10} | Coefficient: {coefficient:>8.4f}")

# Run the basic subset-selection model with the big-M bound M and feature budget k set above.
sparse_basic(X,M,k)

Boston dataset: (506, 19)




Status: optimal
Selected features: [ 2  5 11 12]
Binary variables: [0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.]

=== SELECTED FEATURES ===
Selected 4 out of 18 features:
--------------------------------------------------
  LON        | Coefficient:  -0.2905
  CMEDV      | Coefficient:  -0.4233
  RM         | Coefficient:  -0.8000
  AGE        | Coefficient:   0.0959


## Ridge Regression

In [5]:
def sparse_ridge(X,M,k,lamda):
    """Solve a cardinality-constrained ridge regression and print the result.

    The mixed-integer program is

        minimize    sum_squares(y - X @ beta) + lamda * sum_squares(beta)
        subject to  -M * q <= beta <= M * q
                    sum(q) <= k
                    q binary

    so ``beta[i]`` is forced to zero whenever ``q[i]`` is zero, and at most
    ``k`` coefficients can be non-zero.

    Parameters
    ----------
    X : 2-D array
        Design matrix; the number of columns sets the number of coefficients.
    M : float
        Big-M bound on the absolute value of every coefficient.
    k : int
        Upper limit on the number of selected features.
    lamda : float
        Weight of the squared-L2 penalty on ``beta``.

    Returns
    -------
    None
        The solver status, the indicator vector and the selected coefficients
        are printed.

    Notes
    -----
    Reads the module-level ``y`` (response vector) and ``feature_names``
    (labels indexed like the columns of ``X``). The problem is handed to the
    MOSEK solver.
    """
    p = X.shape[1]
    beta = cp.Variable(p)
    q = cp.Variable(p, boolean=True)

    # Element-wise vector constraints replace one scalar pair per feature.
    constraints = [
        beta >= -M * q,
        beta <= M * q,
        cp.sum(q) <= k,
    ]

    objective = cp.Minimize(cp.sum_squares(y - X @ beta)+lamda*cp.sum_squares(beta))

    problem = cp.Problem(objective, constraints)
    problem.solve(solver=cp.MOSEK, verbose=False)

    print(f"Status: {problem.status}")

    # Variable values stay None when no solution was found (for example an
    # infeasible problem); comparing None against 0.5 would raise TypeError.
    if q.value is None or beta.value is None:
        print("No solution available; nothing to report.")
        return

    # Threshold at 0.5 because the solver returns the binaries as floats.
    selected_indices = np.where(q.value > 0.5)[0]
    print(f"Selected features: {selected_indices}")
    print(f"Binary variables: {q.value}")

    print("\n=== SELECTED FEATURES ===")
    print(f"Selected {len(selected_indices)} out of {p} features:")
    print("-" * 50)

    for idx in selected_indices:
        feature_name = feature_names[idx]
        coefficient = beta.value[idx]
        print(f"  {feature_name:<10} | Coefficient: {coefficient:>8.4f}")

# Run the ridge-penalised subset-selection model with the M, k and lamda values set above.
sparse_ridge(X,M,k,lamda)

Status: optimal
Selected features: [ 2  5 11 12]
Binary variables: [0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.]

=== SELECTED FEATURES ===
Selected 4 out of 18 features:
--------------------------------------------------
  LON        | Coefficient:  -0.2898
  CMEDV      | Coefficient:  -0.4218
  RM         | Coefficient:  -0.8000
  AGE        | Coefficient:   0.0962


## Alternative formulation of ridge regression

In [6]:
def sparse_ridge_alternative(X,M,k,lamda):
    """Solve the sparse ridge problem in epigraph form and print the result.

    An auxiliary vector ``t`` bounds each squared coefficient from above and
    the penalty is applied to ``sum(t)`` instead of ``sum_squares(beta)``:

        minimize    sum_squares(y - X @ beta) + lamda * sum(t)
        subject to  -M * q <= beta <= M * q
                    t >= beta ** 2        (element-wise)
                    sum(q) <= k
                    q binary

    For ``lamda > 0`` minimisation pushes every ``t[i]`` down onto
    ``beta[i] ** 2``, so the optimum matches the direct ridge formulation.

    Parameters
    ----------
    X : 2-D array
        Design matrix; the number of columns sets the number of coefficients.
    M : float
        Big-M bound on the absolute value of every coefficient.
    k : int
        Upper limit on the number of selected features.
    lamda : float
        Weight applied to ``sum(t)``.

    Returns
    -------
    None
        The solver status, the indicator vector and the selected coefficients
        are printed.

    Notes
    -----
    Reads the module-level ``y`` (response vector) and ``feature_names``
    (labels indexed like the columns of ``X``). The problem is handed to the
    MOSEK solver.
    """
    p = X.shape[1]
    beta = cp.Variable(p)
    q = cp.Variable(p, boolean=True)
    t = cp.Variable(p)

    # Element-wise vector constraints replace one scalar triple per feature.
    constraints = [
        beta >= -M * q,
        beta <= M * q,
        t >= cp.square(beta),
        cp.sum(q) <= k,
    ]

    objective = cp.Minimize(cp.sum_squares(y - X @ beta)+lamda*cp.sum(t))

    problem = cp.Problem(objective, constraints)
    problem.solve(solver=cp.MOSEK, verbose=False)

    print(f"Status: {problem.status}")

    # Variable values stay None when no solution was found (for example an
    # infeasible problem); comparing None against 0.5 would raise TypeError.
    if q.value is None or beta.value is None:
        print("No solution available; nothing to report.")
        return

    # Threshold at 0.5 because the solver returns the binaries as floats.
    selected_indices = np.where(q.value > 0.5)[0]
    print(f"Selected features: {selected_indices}")
    print(f"Binary variables: {q.value}")

    print("\n=== SELECTED FEATURES ===")
    print(f"Selected {len(selected_indices)} out of {p} features:")
    print("-" * 50)

    for idx in selected_indices:
        feature_name = feature_names[idx]
        coefficient = beta.value[idx]
        print(f"  {feature_name:<10} | Coefficient: {coefficient:>8.4f}")

# Run the epigraph (auxiliary-variable) form of the ridge model with the same M, k and lamda.
sparse_ridge_alternative(X,M,k,lamda)

Status: optimal
Selected features: [ 2  5 11 12]
Binary variables: [0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.]

=== SELECTED FEATURES ===
Selected 4 out of 18 features:
--------------------------------------------------
  LON        | Coefficient:  -0.2898
  CMEDV      | Coefficient:  -0.4218
  RM         | Coefficient:  -0.8000
  AGE        | Coefficient:   0.0962


In [7]:
import time

# Wall-clock comparison of the two ridge formulations.
# Each measurement wraps the whole function call, so it includes building the
# CVXPY problem and printing the report, not only the time spent in the solver.
# perf_counter() is monotonic, so the differences cannot be distorted by
# system clock adjustments the way time.time() differences can.
t0 = time.perf_counter()
sparse_ridge(X,M,k,lamda)
t1 = time.perf_counter()
total_time_1 = t1-t0
print(f" The solving time of the original formulation of ridge regression:{total_time_1} ")

t2 = time.perf_counter()
sparse_ridge_alternative(X,M,k,lamda)
t3 = time.perf_counter()
total_time_2 = t3-t2
# Label fixed: this measurement belongs to the alternative formulation.
print(f" The solving time of the alternative formulation of ridge regression:{total_time_2} ")

Status: optimal
Selected features: [ 2  5 11 12]
Binary variables: [0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.]

=== SELECTED FEATURES ===
Selected 4 out of 18 features:
--------------------------------------------------
  LON        | Coefficient:  -0.2898
  CMEDV      | Coefficient:  -0.4218
  RM         | Coefficient:  -0.8000
  AGE        | Coefficient:   0.0962
 The solving time of the original formulation of ridge regression:1.7469868659973145 
Status: optimal
Selected features: [ 2  5 11 12]
Binary variables: [0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.]

=== SELECTED FEATURES ===
Selected 4 out of 18 features:
--------------------------------------------------
  LON        | Coefficient:  -0.2898
  CMEDV      | Coefficient:  -0.4218
  RM         | Coefficient:  -0.8000
  AGE        | Coefficient:   0.0962
 The solving time of the original formulation of ridge regression:1.714967966079712 
