Data Science Practicals
1) Data Wrangling I
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
-----------------------------------------------------
df = pd.read_csv("C:\\Users\\isham\\Downloads\\archive (4)\\train.csv") # Assuming file is in the same directory
df.head()
----------------------------------------------------
df.isnull().sum()
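--------------------------------------------------
# One common way to handle the missing values reported above (a sketch,
# assuming the standard Titanic columns Age, Embarked and Cabin):
df['Age'] = df['Age'].fillna(df['Age'].median())                  # numeric: fill with the median
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])  # categorical: fill with the mode
df = df.drop(columns=['Cabin'])                                   # mostly missing: drop the column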
--------------------------------------------------
df.describe()
------------------------------------------------
df.shape
--------------------------------------
df.dtypes
---------------------------------
df['Pclass'] = df['Pclass'].astype('category')
df['Sex'] = df['Sex'].astype('category')
df['Embarked'] = df['Embarked'].astype('category')
------------------------------------------------------
pd.get_dummies(df, columns=['Pclass'], drop_first=True).head()  # column names are case-sensitive: 'Pclass', not 'pclass'
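------------------------------------------------------
# An alternative quantitative conversion (a sketch): map each category to
# an integer code instead of creating one-hot columns.
df['Sex_Code'] = df['Sex'].cat.codes            # e.g. female -> 0, male -> 1
df['Embarked_Code'] = df['Embarked'].cat.codes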
-----------------------------------------------------
2) Data Wrangling II
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
---------------------------------------------------
# df should hold the student-performance dataset (Math_Score, Science_Score,
# English_Score, Attendance, Study_Hours, GPA); first check for missing values.
print(df.isnull().sum())
-------------------------------------------
df['Math_Score'] = df['Math_Score'].fillna(df['Math_Score'].mean())
-----------------------------------------
numeric_cols = ['Math_Score', 'Science_Score', 'English_Score', 'Attendance', 'Study_Hours', 'GPA']
# Plot boxplots
for col in numeric_cols:
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot for {col}')
    plt.show()
-----------------------------------------------------------------
z_scores = np.abs(stats.zscore(df[numeric_cols], nan_policy='omit'))  # nan_policy='omit' skips any remaining NaNs
outliers = (z_scores > 3).any(axis=1)
print("Outliers detected:\n", df[outliers])
------------------------------------------------------
cap = df['Study_Hours'].quantile(0.95)
df['Study_Hours'] = np.where(df['Study_Hours'] > cap, cap, df['Study_Hours'])
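----------------------------------------------
# An alternative capping rule (a sketch): use the 1.5*IQR fence instead of
# a fixed 95th-percentile cap.
Q1, Q3 = df['Study_Hours'].quantile([0.25, 0.75])
upper_fence = Q3 + 1.5 * (Q3 - Q1)
df['Study_Hours'] = df['Study_Hours'].clip(upper=upper_fence)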
----------------------------------------------
sns.histplot(df['Study_Hours'], kde=True)
plt.title("Before Log Transformation")
plt.show()
---------------------------------------------------
df['Log_Study_Hours'] = np.log1p(df['Study_Hours']) # log(1 + x)
------------------------------------------------------------------------
sns.histplot(df['Log_Study_Hours'], kde=True)
plt.title("After Log Transformation")
plt.show()
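---------------------------------------------------
# Quantify what the transformation did (a sketch): skewness should move
# closer to 0 after np.log1p.
print("Skewness before:", df['Study_Hours'].skew())
print("Skewness after: ", df['Log_Study_Hours'].skew())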
---------------------------------------------------------------------------
3) Descriptive Statistics - Measures of Central Tendency and Variability
import pandas as pd
import numpy as np
data = {
    'Person_ID': range(1, 11),
    'Age_Group': ['18-25', '18-25', '26-35', '26-35', '26-35', '36-45', '36-45', '46-60', '46-60', '60+'],
    'Income': [15000, 18000, 25000, 27000, 26000, 30000, 32000, 28000, 29000, 21000]
}
df = pd.DataFrame(data)
-----------------------------------------------------------------
grouped_stats = df.groupby('Age_Group')['Income'].agg(['mean', 'median', 'min', 'max', 'std']).reset_index()
print(grouped_stats)
---------------------------------------------------------
age_mapping = {'18-25': 1, '26-35': 2, '36-45': 3, '46-60': 4, '60+': 5}
df['Age_Group_Num'] = df['Age_Group'].map(age_mapping)
print(df[['Age_Group', 'Age_Group_Num']])
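----------------------------------------
# Overall measures of central tendency and variability for Income (a sketch):
print("Mean:    ", df['Income'].mean())
print("Median:  ", df['Income'].median())
print("Mode:    ", df['Income'].mode()[0])
print("Std Dev: ", df['Income'].std())
print("Variance:", df['Income'].var())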
----------------------------------------
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()
--------------------------------------
species_grouped = iris.groupby('species').describe()
print(species_grouped)
-----------------------------
for species in iris['species'].unique():
    print(f"\nSpecies: {species}")
    print(iris[iris['species'] == species].describe())
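--------------------------------------------------
# A more compact per-species summary (a sketch): selected statistics for
# one feature at a time via groupby-agg.
summary = iris.groupby('species')['sepal_length'].agg(['mean', 'median', 'std', 'min', 'max'])
print(summary)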
--------------------------------------------------
4) Data Analytics I
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
--------------------------------------------
df = pd.read_csv('C:\\Users\\isham\\Downloads\\archive (5)\\housing.csv') # Or the correct path to your file
df.head()
---------------------------------------------------
print(df.info())
print(df.describe())
print(df.columns)
-----------------------------------------
print(df.isnull().sum())
--------------------------------
df = df.dropna() # or use fillna()
------------------------------
X = df.drop('MEDV', axis=1) # Independent variables
y = df['MEDV'] # Dependent variable (target)
----------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-------------------------------
model = LinearRegression()
model.fit(X_train, y_train)
-------------------------------------------
y_pred = model.predict(X_test)
# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
---------------------------------------
plt.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted Prices')
plt.show()
------------------------------------------------------
5) Data Analytics II
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
---------------------------------------------
df = pd.read_csv("D:\\dsp\\Social_Network_Ads.csv")
df.head()
----------------------------------------
X = df[['Age', 'EstimatedSalary']]
y = df['Purchased']
----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
---------------------------------
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
-----------------------------------
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
----------------------------------------
y_pred = classifier.predict(X_test)
--------------------------------
cm = confusion_matrix(y_test, y_pred)
# Extract values
TN, FP, FN, TP = cm.ravel()
# Metrics
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Output
print(f"Confusion Matrix:\n{cm}")
print(f"True Positives (TP): {TP}")
print(f"False Positives (FP): {FP}")
print(f"True Negatives (TN): {TN}")
print(f"False Negatives (FN): {FN}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Error Rate: {error_rate:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
----------------------------------------------------------
6) Data Analytics III
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import pandas as pd
# Load dataset
iris = load_iris()
# Convert to DataFrame
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['species'] = iris.target
# Optional: Map numeric species to names
df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
df.head()
--------------------------------------------------------
X = df.drop('species', axis=1)
y = df['species']
----------------------------
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y) # Converts species to 0, 1, 2
-------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
----------------------------------------------
model = GaussianNB()
model.fit(X_train, y_train)
------------------------------------------
y_pred = model.predict(X_test)
-------------------------------------
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred, average='macro') # Use 'macro' for multiclass
recall = recall_score(y_test, y_pred, average='macro')
print(f"\nAccuracy: {accuracy:.2f}")
print(f"Error Rate: {error_rate:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
-----------------------------------------------------
7) Text Analytics
# === Import Required Libraries ===
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
# === Download NLTK Data ===
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
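# Note: newer NLTK releases may additionally require 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' for tokenization and POS tagging.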
---------------------------------------------------
# === Sample Text ===
sample_doc = "Natural Language Processing enables computers to understand and process human language efficiently."
-----------------------------------------
# === Tokenization ===
tokens = word_tokenize(sample_doc)
print("Tokens:\n", tokens)
------------------------------------------
# === Part-of-Speech Tagging ===
pos_tags = pos_tag(tokens)
print("\nPOS Tags:\n", pos_tags)
------------------------------------
# === Remove Stop Words ===
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("\nAfter Stop Words Removal:\n", filtered_tokens)
---------------------------------------
# === Stemming ===
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("\nAfter Stemming:\n", stemmed_tokens)
-------------------------------------
# === Lemmatization ===
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("\nAfter Lemmatization:\n", lemmatized_tokens)
------------------------------------
# === TF-IDF on Sample Documents ===
documents = [
    "Natural Language Processing enables computers to understand human language.",
    "Machine learning and NLP are core parts of AI.",
    "TF-IDF is used for information retrieval and text mining."
]
# Vectorize
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
# Show as DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:\n")
print(tfidf_df)
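------------------------------------------------------
# Inspect the learned IDF weights (a sketch): rarer terms get higher weights.
# With sklearn's defaults, idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1.
idf_weights = pd.Series(vectorizer.idf_, index=vectorizer.get_feature_names_out())
print(idf_weights.sort_values(ascending=False))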
------------------------------------------------------
8) Data Visualization I
# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Load the Titanic dataset
titanic = sns.load_dataset('titanic')
# Show first few rows
print(titanic.head())
----------------------------------------
# Plot histogram for 'fare' column
sns.histplot(data=titanic, x='fare', kde=True, bins=30) # kde=True plots a smooth curve
plt.title('Distribution of Ticket Fare')
plt.xlabel('Fare')
plt.ylabel('Number of Passengers')
plt.show()
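------------------------------------------
# The fare distribution is strongly right-skewed; a quick numeric check (a sketch):
print("Fare skewness:", titanic['fare'].skew())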
------------------------------------------
9) Data Visualization II
# Import necessary libraries
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Titanic dataset
titanic = sns.load_dataset('titanic')
# Display first few rows
print(titanic.head())
-------------------------------------------
# Create a boxplot
plt.figure(figsize=(10,6))
sns.boxplot(x='sex', y='age', hue='survived', data=titanic)
# Add title and labels
plt.title('Boxplot of Age vs Gender with Survival Status')
plt.xlabel('Sex')
plt.ylabel('Age')
plt.show()
---------------------------------------------------------------
# Observations from the box plot
# Let's check the basic statistics for 'age' with respect to 'sex' and 'survived' to help in making observations
# Group by sex and survived, and get descriptive statistics for age
age_statistics = titanic.groupby(['sex', 'survived'])['age'].describe()
print(age_statistics)
------------------------------------------------------
10) Data Visualization III
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Iris dataset
iris = sns.load_dataset('iris')
# Display the first few rows
print(iris.head())
----------------------------------------------------------
# List features and their types
# Check dataset info
print(iris.info())
-----------------------------------------------------
# Plot histograms for each numeric feature
# Plot histograms
iris.hist(figsize=(10,8))
plt.suptitle('Histograms of Iris Features')
plt.show()
-----------------------------------------------------------
# Boxplots for each numeric feature
plt.figure(figsize=(12,8))
for i, column in enumerate(iris.columns[:-1], 1):
    plt.subplot(2, 2, i)
    sns.boxplot(y=iris[column])
    plt.title(f'Boxplot of {column}')
plt.tight_layout()
plt.show()
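------------------------------------------------------------------
# Identify the outliers the boxplots hint at, using the 1.5*IQR rule (a sketch):
for column in iris.columns[:-1]:
    Q1, Q3 = iris[column].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    mask = (iris[column] < Q1 - 1.5 * IQR) | (iris[column] > Q3 + 1.5 * IQR)
    print(f"{column}: {mask.sum()} outlier(s)")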
------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Titanic dataset
titanic = sns.load_dataset("titanic")
# Set up the figure
plt.figure(figsize=(10, 6))
# Create the boxplot
sns.boxplot(x="sex", y="age", hue="survived", data=titanic)
# Customize plot
plt.title("Box Plot of Age by Gender and Survival Status")
plt.xlabel("Sex")
plt.ylabel("Age")
plt.legend(title="Survived", labels=["No", "Yes"])
plt.grid(True)
plt.show()
# Observations and Inferences
print("\n📌 Observations and Inferences:")
print("1. Males who did not survive had a wide age distribution, with a slightly higher median age.")
print("2. Males who survived were younger on average, mostly aged between 20–30.")
print("3. Females who survived were generally older than the males who survived, with a more consistent age range.")
print("4. Few females did not survive, and they spanned a moderate age range.")
print("5. Overall, females had a higher survival rate than males, likely due to evacuation priorities.")
print("6. Children and younger passengers had relatively better survival chances.")
print("7. Older males had the lowest survival rate compared to other groups.")