Data Science Practicals
1) Data Wrangling I
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
-----------------------------------------------------
df = pd.read_csv("C:\\Users\\isham\\Downloads\\archive (4)\\train.csv") # Assuming file is in the same directory
df.head()
----------------------------------------------------
df.isnull().sum()
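--------------------------------------------------
# One common way to handle the missing values reported above (a sketch,
# assuming the standard Titanic columns Age, Embarked and Cabin):
df['Age'] = df['Age'].fillna(df['Age'].median())                  # numeric: fill with the median
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])  # categorical: fill with the mode
df = df.drop(columns=['Cabin'])                                   # mostly missing: drop the column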
--------------------------------------------------
df.describe()
------------------------------------------------
df.shape
--------------------------------------
df.dtypes
---------------------------------
df['Pclass'] = df['Pclass'].astype('category')
df['Sex'] = df['Sex'].astype('category')
df['Embarked'] = df['Embarked'].astype('category')
------------------------------------------------------
pd.get_dummies(df, columns=['Pclass'], drop_first=True).head()  # column names are case-sensitive: 'Pclass', not 'pclass'
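------------------------------------------------------
# An alternative quantitative conversion (a sketch): map each category to
# an integer code instead of creating one-hot columns.
df['Sex_Code'] = df['Sex'].cat.codes            # e.g. female -> 0, male -> 1
df['Embarked_Code'] = df['Embarked'].cat.codes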
-----------------------------------------------------
2) Data Wrangling II
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
---------------------------------------------------
# df should hold the student-performance dataset (Math_Score, Science_Score,
# English_Score, Attendance, Study_Hours, GPA); first check for missing values.
print(df.isnull().sum())
-------------------------------------------
df['Math_Score'] = df['Math_Score'].fillna(df['Math_Score'].mean())
-----------------------------------------
numeric_cols = ['Math_Score', 'Science_Score', 'English_Score', 'Attendance', 'Study_Hours', 'GPA']
# Plot boxplots
for col in numeric_cols:
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot for {col}')
    plt.show()
-----------------------------------------------------------------
z_scores = np.abs(stats.zscore(df[numeric_cols], nan_policy='omit'))  # nan_policy='omit' skips any remaining NaNs
outliers = (z_scores > 3).any(axis=1)
print("Outliers detected:\n", df[outliers])
------------------------------------------------------
cap = df['Study_Hours'].quantile(0.95)
df['Study_Hours'] = np.where(df['Study_Hours'] > cap, cap, df['Study_Hours'])
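----------------------------------------------
# An alternative capping rule (a sketch): use the 1.5*IQR fence instead of
# a fixed 95th-percentile cap.
Q1, Q3 = df['Study_Hours'].quantile([0.25, 0.75])
upper_fence = Q3 + 1.5 * (Q3 - Q1)
df['Study_Hours'] = df['Study_Hours'].clip(upper=upper_fence)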
----------------------------------------------
sns.histplot(df['Study_Hours'], kde=True)
plt.title("Before Log Transformation")
plt.show()
---------------------------------------------------
df['Log_Study_Hours'] = np.log1p(df['Study_Hours']) # log(1 + x)
------------------------------------------------------------------------
sns.histplot(df['Log_Study_Hours'], kde=True)
plt.title("After Log Transformation")
plt.show()
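---------------------------------------------------
# Quantify what the transformation did (a sketch): skewness should move
# closer to 0 after np.log1p.
print("Skewness before:", df['Study_Hours'].skew())
print("Skewness after: ", df['Log_Study_Hours'].skew())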
---------------------------------------------------------------------------
3) Descriptive Statistics - Measures of Central Tendency and Variability
import pandas as pd
import numpy as np
data = {
    'Person_ID': range(1, 11),
    'Age_Group': ['18-25', '18-25', '26-35', '26-35', '26-35', '36-45', '36-45', '46-60', '46-60', '60+'],
    'Income': [15000, 18000, 25000, 27000, 26000, 30000, 32000, 28000, 29000, 21000]
}
df = pd.DataFrame(data)
-----------------------------------------------------------------
grouped_stats = df.groupby('Age_Group')['Income'].agg(['mean', 'median', 'min', 'max', 'std']).reset_index()
print(grouped_stats)
---------------------------------------------------------
age_mapping = {'18-25': 1, '26-35': 2, '36-45': 3, '46-60': 4, '60+': 5}
df['Age_Group_Num'] = df['Age_Group'].map(age_mapping)
print(df[['Age_Group', 'Age_Group_Num']])
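----------------------------------------
# Overall measures of central tendency and variability for Income (a sketch):
print("Mean:    ", df['Income'].mean())
print("Median:  ", df['Income'].median())
print("Mode:    ", df['Income'].mode()[0])
print("Std Dev: ", df['Income'].std())
print("Variance:", df['Income'].var())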
----------------------------------------
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()
--------------------------------------
species_grouped = iris.groupby('species').describe()
print(species_grouped)
-----------------------------
for species in iris['species'].unique():
    print(f"\nSpecies: {species}")
    print(iris[iris['species'] == species].describe())
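--------------------------------------------------
# A more compact per-species summary (a sketch): selected statistics for
# one feature at a time via groupby-agg.
summary = iris.groupby('species')['sepal_length'].agg(['mean', 'median', 'std', 'min', 'max'])
print(summary)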
--------------------------------------------------
4) Data Analytics I
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
--------------------------------------------
df = pd.read_csv('C:\\Users\\isham\\Downloads\\archive (5)\\housing.csv') # Or the correct path to your file
df.head()
---------------------------------------------------
print(df.info())
print(df.describe())
print(df.columns)
-----------------------------------------
print(df.isnull().sum())
--------------------------------
df = df.dropna() # or use fillna()
------------------------------
X = df.drop('MEDV', axis=1) # Independent variables
y = df['MEDV'] # Dependent variable (target)
----------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-------------------------------
model = LinearRegression()
model.fit(X_train, y_train)
-------------------------------------------
y_pred = model.predict(X_test)
# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
---------------------------------------
plt.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted Prices')
plt.show()
------------------------------------------------------
5) Data Analytics II
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
---------------------------------------------
df = pd.read_csv("D:\\dsp\\Social_Network_Ads.csv")
df.head()
----------------------------------------
X = df[['Age', 'EstimatedSalary']]
y = df['Purchased']
----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
---------------------------------
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
-----------------------------------
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
----------------------------------------
y_pred = classifier.predict(X_test)
--------------------------------
cm = confusion_matrix(y_test, y_pred)
# Extract values
TN, FP, FN, TP = cm.ravel()
# Metrics
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Output
print(f"Confusion Matrix:\n{cm}")
print(f"True Positives (TP): {TP}")
print(f"False Positives (FP): {FP}")
print(f"True Negatives (TN): {TN}")
print(f"False Negatives (FN): {FN}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Error Rate: {error_rate:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
----------------------------------------------------------
6) Data Analytics III
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import pandas as pd
# Load dataset
iris = load_iris()
# Convert to DataFrame
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['species'] = iris.target
# Optional: Map numeric species to names
df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
df.head()
--------------------------------------------------------
X = df.drop('species', axis=1)
y = df['species']
----------------------------
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y) # Converts species to 0, 1, 2
-------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
----------------------------------------------
model = GaussianNB()
model.fit(X_train, y_train)
------------------------------------------
y_pred = model.predict(X_test)
-------------------------------------
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred, average='macro') # Use 'macro' for multiclass
recall = recall_score(y_test, y_pred, average='macro')
print(f"\nAccuracy: {accuracy:.2f}")
print(f"Error Rate: {error_rate:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
-----------------------------------------------------
7) Text Analytics
# === Import Required Libraries ===
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
# === Download NLTK Data ===
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
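# Note: newer NLTK releases may additionally require 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' for tokenization and POS tagging.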
---------------------------------------------------
# === Sample Text ===
sample_doc = "Natural Language Processing enables computers to understand and process human language efficiently."
-----------------------------------------
# === Tokenization ===
tokens = word_tokenize(sample_doc)
print("Tokens:\n", tokens)
------------------------------------------
# === Part-of-Speech Tagging ===
pos_tags = pos_tag(tokens)
print("\nPOS Tags:\n", pos_tags)
------------------------------------
# === Remove Stop Words ===
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("\nAfter Stop Words Removal:\n", filtered_tokens)
---------------------------------------
# === Stemming ===
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("\nAfter Stemming:\n", stemmed_tokens)
-------------------------------------
# === Lemmatization ===
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("\nAfter Lemmatization:\n", lemmatized_tokens)
------------------------------------
# === TF-IDF on Sample Documents ===
documents = [
    "Natural Language Processing enables computers to understand human language.",
    "Machine learning and NLP are core parts of AI.",
    "TF-IDF is used for information retrieval and text mining."
]
# Vectorize
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
# Show as DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:\n")
print(tfidf_df)
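------------------------------------------------------
# Inspect the learned IDF weights (a sketch): rarer terms get higher weights.
# With sklearn's defaults, idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1.
idf_weights = pd.Series(vectorizer.idf_, index=vectorizer.get_feature_names_out())
print(idf_weights.sort_values(ascending=False))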
------------------------------------------------------
8) Data Visualization I
# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Load the Titanic dataset
titanic = sns.load_dataset('titanic')
# Show first few rows
print(titanic.head())
----------------------------------------
# Plot histogram for 'fare' column
sns.histplot(data=titanic, x='fare', kde=True, bins=30) # kde=True plots a smooth curve
plt.title('Distribution of Ticket Fare')
plt.xlabel('Fare')
plt.ylabel('Number of Passengers')
plt.show()
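------------------------------------------
# The fare distribution is strongly right-skewed; a quick numeric check (a sketch):
print("Fare skewness:", titanic['fare'].skew())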
------------------------------------------
9) Data Visualization II
# Import necessary libraries
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Titanic dataset
titanic = sns.load_dataset('titanic')
# Display first few rows
print(titanic.head())
-------------------------------------------
# Create a boxplot
plt.figure(figsize=(10,6))
sns.boxplot(x='sex', y='age', hue='survived', data=titanic)
# Add title and labels
plt.title('Boxplot of Age vs Gender with Survival Status')
plt.xlabel('Sex')
plt.ylabel('Age')
plt.show()
---------------------------------------------------------------
# Observations from the box plot
# Let's check the basic statistics for 'age' with respect to 'sex' and 'survived' to help in making observations
# Group by sex and survived, and get descriptive statistics for age
age_statistics = titanic.groupby(['sex', 'survived'])['age'].describe()
print(age_statistics)
------------------------------------------------------
10) Data Visualization III
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Iris dataset
iris = sns.load_dataset('iris')
# Display the first few rows
print(iris.head())
----------------------------------------------------------
# List features and their types
# Check dataset info
print(iris.info())
-----------------------------------------------------
# Plot histograms for each numeric feature
# Plot histograms
iris.hist(figsize=(10,8))
plt.suptitle('Histograms of Iris Features')
plt.show()
-----------------------------------------------------------
# Boxplots for each numeric feature
plt.figure(figsize=(12,8))
for i, column in enumerate(iris.columns[:-1], 1):
    plt.subplot(2, 2, i)
    sns.boxplot(y=iris[column])
    plt.title(f'Boxplot of {column}')
plt.tight_layout()
plt.show()
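------------------------------------------------------------------
# Identify the outliers the boxplots hint at, using the 1.5*IQR rule (a sketch):
for column in iris.columns[:-1]:
    Q1, Q3 = iris[column].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    mask = (iris[column] < Q1 - 1.5 * IQR) | (iris[column] > Q3 + 1.5 * IQR)
    print(f"{column}: {mask.sum()} outlier(s)")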
------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Titanic dataset
titanic = sns.load_dataset("titanic")
# Set up the figure
plt.figure(figsize=(10, 6))
# Create the boxplot
sns.boxplot(x="sex", y="age", hue="survived", data=titanic)
# Customize plot
plt.title("Box Plot of Age by Gender and Survival Status")
plt.xlabel("Sex")
plt.ylabel("Age")
plt.legend(title="Survived", labels=["No", "Yes"])
plt.grid(True)
plt.show()
# Observations and Inferences
print("\n📌 Observations and Inferences:")
print("1. Males who did not survive had a wide age distribution, with a slightly higher median age.")
print("2. Males who survived were younger on average, mostly aged between 20–30.")
print("3. Females who survived were generally older than the males who survived, with a more consistent age range.")
print("4. Few females did not survive, and they spanned a moderate age range.")
print("5. Overall, females had a higher survival rate than males, likely due to evacuation priorities.")
print("6. Children and younger passengers had relatively better survival chances.")
print("7. Older males had the lowest survival rate compared to other groups.")