Python Scripts For Beginners: Data Analysis


A collection of 100 Python scripts for beginners, focused on data analysis; each one tackles a fundamental task and provides a useful, self-contained example.
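
Most of the scripts below read from a local 'data.csv' or 'time_series_data.csv'. If you do not have a dataset handy, the following sketch generates small sample files with the column names the later examples reference (Date, Category, SubCategory, Value, Value1, Value2, Feature1, Feature2, Target); the values themselves are random and purely illustrative.

import numpy as np
import pandas as pd

def make_sample_files():
    rng = np.random.default_rng(42)
    n = 200
    data = pd.DataFrame({
        'Date': pd.date_range('2023-01-01', periods=n, freq='D'),
        'Category': rng.choice(['A', 'B', 'C'], size=n),
        'SubCategory': rng.choice(['X', 'Y'], size=n),
        'Value': rng.normal(100, 20, size=n),
        'Value1': rng.normal(0, 1, size=n),
        'Value2': rng.normal(0, 1, size=n),
        'Feature1': rng.normal(0, 1, size=n),
        'Feature2': rng.normal(0, 1, size=n),
        'Target': rng.integers(0, 2, size=n),
    })
    data.to_csv('data.csv', index=False)
    # The time series examples only need a Date column and a Value column
    data[['Date', 'Value']].to_csv('time_series_data.csv', index=False)

make_sample_files()

Note that the model-oriented scripts further down expect purely numeric feature columns; for those, the synthetic dataset produced by script 93 is a better fit than this mixed file.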

100 Python Scripts For Beginners: Data Analysis

1. Load and Display CSV Data

import pandas as pd

def load_csv(file_path):
    data = pd.read_csv(file_path)
    print(data.head())

load_csv('data.csv')

Loads a CSV file into a DataFrame and displays the first few rows.


2. Summary Statistics

import pandas as pd

def summary_statistics(file_path):
    data = pd.read_csv(file_path)
    print(data.describe())

summary_statistics('data.csv')

Generates summary statistics for numerical columns in the dataset.


3. Data Cleaning (Removing NaN Values)

import pandas as pd

def clean_data(file_path):
    data = pd.read_csv(file_path)
    cleaned_data = data.dropna()
    print(cleaned_data.head())

clean_data('data.csv')

Removes rows with NaN values from the dataset.


4. Data Filtering

import pandas as pd

def filter_data(file_path, column_name, value):
    data = pd.read_csv(file_path)
    filtered_data = data[data[column_name] == value]
    print(filtered_data.head())

filter_data('data.csv', 'Category', 'A')

Filters rows where a specified column matches a given value.


5. Data Aggregation

import pandas as pd

def aggregate_data(file_path, column_name):
    data = pd.read_csv(file_path)
    aggregated = data.groupby(column_name).sum()
    print(aggregated)

aggregate_data('data.csv', 'Category')

Aggregates data by summing numerical columns, grouped by a specified column.


6. Data Merging

import pandas as pd

def merge_data(file1, file2, key):
    data1 = pd.read_csv(file1)
    data2 = pd.read_csv(file2)
    merged_data = pd.merge(data1, data2, on=key)
    print(merged_data.head())

merge_data('data1.csv', 'data2.csv', 'ID')

Merges two datasets on a common key.


7. Data Pivoting

import pandas as pd

def pivot_data(file_path, index, columns, values):
    data = pd.read_csv(file_path)
    pivoted_data = data.pivot_table(index=index, columns=columns, values=values)
    print(pivoted_data.head())

pivot_data('data.csv', 'Date', 'Category', 'Value')

Pivots the DataFrame to create a new table with specified index, columns, and values.


8. Time Series Analysis

import pandas as pd

def time_series_analysis(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    print(data.resample('M').sum())

time_series_analysis('time_series_data.csv')

Performs monthly aggregation on a time series dataset.


9. Data Visualization (Histogram)

import pandas as pd
import matplotlib.pyplot as plt

def plot_histogram(file_path, column_name):
    data = pd.read_csv(file_path)
    data[column_name].hist()
    plt.title('Histogram')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.show()

plot_histogram('data.csv', 'Value')

Plots a histogram of a specified column.


10. Data Visualization (Scatter Plot)

import pandas as pd
import matplotlib.pyplot as plt

def plot_scatter(file_path, x_col, y_col):
    data = pd.read_csv(file_path)
    plt.scatter(data[x_col], data[y_col])
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title('Scatter Plot')
    plt.show()

plot_scatter('data.csv', 'Value1', 'Value2')

Plots a scatter plot of two columns.


11. Data Visualization (Line Plot)

import pandas as pd
import matplotlib.pyplot as plt

def plot_line(file_path, x_col, y_col):
    data = pd.read_csv(file_path)
    plt.plot(data[x_col], data[y_col])
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title('Line Plot')
    plt.show()

plot_line('data.csv', 'Date', 'Value')

Plots a line graph of two columns.


12. Correlation Matrix

import pandas as pd

def correlation_matrix(file_path):
    data = pd.read_csv(file_path)
    correlation = data.corr(numeric_only=True)  # ignore non-numeric columns
    print(correlation)

correlation_matrix('data.csv')

Computes and prints the correlation matrix for numerical columns.


13. Handling Missing Data (Fill NaN)

import pandas as pd

def fill_missing_data(file_path, value):
    data = pd.read_csv(file_path)
    filled_data = data.fillna(value)
    print(filled_data.head())

fill_missing_data('data.csv', 0)

Fills missing values in the dataset with a specified value.


14. Handling Missing Data (Interpolate)

import pandas as pd

def interpolate_missing_data(file_path):
    data = pd.read_csv(file_path)
    interpolated_data = data.interpolate()
    print(interpolated_data.head())

interpolate_missing_data('data.csv')

Interpolates missing values in the dataset.


15. Detect Outliers

import pandas as pd

def detect_outliers(file_path, column_name):
    data = pd.read_csv(file_path)
    q1 = data[column_name].quantile(0.25)
    q3 = data[column_name].quantile(0.75)
    iqr = q3 - q1
    outliers = data[(data[column_name] < (q1 - 1.5 * iqr)) | (data[column_name] > (q3 + 1.5 * iqr))]
    print(outliers)

detect_outliers('data.csv', 'Value')

Detects outliers in a specified column using the IQR method.
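
As a quick sanity check of the rule above, here is the same calculation on a small hand-made series (the numbers are arbitrary and chosen only for illustration):

import pandas as pd

values = pd.Series([11, 12, 12, 13, 12, 11, 14, 13, 95])
q1, q3 = values.quantile(0.25), values.quantile(0.75)   # 12 and 13
iqr = q3 - q1                                           # 1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr           # 10.5 and 14.5
print(values[(values < lower) | (values > upper)])      # only 95 is flagged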


16. Apply Function to DataFrame

import pandas as pd

def apply_function(file_path):
    data = pd.read_csv(file_path)
    data['NewColumn'] = data['Value'].apply(lambda x: x * 2)
    print(data.head())

apply_function('data.csv')

Applies a function to a column in the DataFrame.


17. Normalize Data

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def normalize_data(file_path):
    data = pd.read_csv(file_path)
    scaler = MinMaxScaler()
    data[['Value']] = scaler.fit_transform(data[['Value']])
    print(data.head())

normalize_data('data.csv')

Normalizes a column of numerical data to a range between 0 and 1.


18. Standardize Data

import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardize_data(file_path):
    data = pd.read_csv(file_path)
    scaler = StandardScaler()
    data[['Value']] = scaler.fit_transform(data[['Value']])
    print(data.head())

standardize_data('data.csv')

Standardizes a column of numerical data to have a mean of 0 and a standard deviation of 1.


19. Detect Duplicate Rows

import pandas as pd

def detect_duplicates(file_path):
    data = pd.read_csv(file_path)
    duplicates = data[data.duplicated()]
    print(duplicates)

detect_duplicates('data.csv')

Detects and prints duplicate rows in the dataset.


20. Drop Duplicate Rows

import pandas as pd

def drop_duplicates(file_path):
    data = pd.read_csv(file_path)
    data_cleaned = data.drop_duplicates()
    print(data_cleaned.head())

drop_duplicates('data.csv')

Removes duplicate rows from the dataset.


21. Extract Unique Values

import pandas as pd

def unique_values(file_path, column_name):
    data = pd.read_csv(file_path)
    unique_vals = data[column_name].unique()
    print(unique_vals)

unique_values('data.csv', 'Category')

Extracts and prints unique values from a specified column.


22. Count Value Occurrences

import pandas as pd

def count_value_occurrences(file_path, column_name):
    data = pd.read_csv(file_path)
    counts = data[column_name].value_counts()
    print(counts)

count_value_occurrences('data.csv', 'Category')

Counts and prints occurrences of each unique value in a specified column.


23. Save DataFrame to CSV

import pandas as pd

def save_to_csv(data, file_path):
    data.to_csv(file_path, index=False)
    print(f"Data saved to {file_path}")

data = pd.read_csv('data.csv')
save_to_csv(data, 'saved_data.csv')

Saves a DataFrame to a CSV file.


24. Handle Categorical Data

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def handle_categorical_data(file_path):
    data = pd.read_csv(file_path)
    encoder = LabelEncoder()
    data['CategoryEncoded'] = encoder.fit_transform(data['Category'])
    print(data.head())

handle_categorical_data('data.csv')

Encodes categorical data into numerical values using LabelEncoder.


25. Apply Rolling Window Analysis

import pandas as pd

def rolling_window_analysis(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    rolling_mean = data['Value'].rolling(window=7).mean()
    print(rolling_mean)

rolling_window_analysis('time_series_data.csv')

Calculates the rolling mean for time series data.


26. Detect Seasonal Trends

import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

def detect_seasonal_trends(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    decomposition = sm.tsa.seasonal_decompose(data['Value'], model='additive')
    decomposition.plot()
    plt.show()

detect_seasonal_trends('time_series_data.csv')

Detects and plots seasonal trends in time series data.


27. Basic Descriptive Statistics

import pandas as pd

def descriptive_statistics(file_path):
    data = pd.read_csv(file_path)
    desc_stats = data.describe(include='all')
    print(desc_stats)

descriptive_statistics('data.csv')

Provides basic descriptive statistics for all columns in the dataset.


28. Feature Engineering

import pandas as pd

def feature_engineering(file_path):
    data = pd.read_csv(file_path)
    data['Value_Squared'] = data['Value'] ** 2
    print(data.head())

feature_engineering('data.csv')

Creates a new feature by squaring an existing column.


29. Data Aggregation by Date

import pandas as pd

def aggregate_by_date(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'])
    aggregated_data = data.groupby(data['Date'].dt.date).sum()
    print(aggregated_data)

aggregate_by_date('time_series_data.csv')

Aggregates data by date, summing numerical values.


30. Visualize Data Distribution

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def visualize_distribution(file_path, column_name):
    data = pd.read_csv(file_path)
    sns.histplot(data[column_name], kde=True)
    plt.title('Data Distribution')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.show()

visualize_distribution('data.csv', 'Value')

Visualizes the distribution of a numerical column using a histogram and KDE.


31. Cross-tabulation

import pandas as pd

def cross_tabulation(file_path, row_col, col_col):
    data = pd.read_csv(file_path)
    crosstab = pd.crosstab(data[row_col], data[col_col])
    print(crosstab)

cross_tabulation('data.csv', 'Category', 'SubCategory')

Generates a cross-tabulation of two categorical columns.


32. Calculate Moving Average

import pandas as pd

def moving_average(file_path, window_size):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    moving_avg = data['Value'].rolling(window=window_size).mean()
    print(moving_avg)

moving_average('time_series_data.csv', 7)

Calculates the moving average for a specified window size.


33. Data Resampling

import pandas as pd

def resample_data(file_path, frequency):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    resampled_data = data.resample(frequency).mean()
    print(resampled_data)

resample_data('time_series_data.csv', 'M')

Resamples time series data to a specified frequency (e.g., monthly).


34. Detect Anomalies

import pandas as pd
import numpy as np

def detect_anomalies(file_path):
    data = pd.read_csv(file_path)
    mean = data['Value'].mean()
    std_dev = data['Value'].std()
    anomalies = data[(data['Value'] > mean + 2 * std_dev) | (data['Value'] < mean - 2 * std_dev)]
    print(anomalies)

detect_anomalies('data.csv')

Detects anomalies as values more than two standard deviations from the mean.


35. Encode Dates

import pandas as pd

def encode_dates(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'])
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day
    print(data.head())

encode_dates('time_series_data.csv')

Extracts and encodes year, month, and day from date columns.


36. Compute Skewness

import pandas as pd

def compute_skewness(file_path):
    data = pd.read_csv(file_path)
    skewness = data['Value'].skew()
    print(f"Skewness: {skewness}")

compute_skewness('data.csv')

Computes the skewness of a numerical column.


37. Compute Kurtosis

import pandas as pd

def compute_kurtosis(file_path):
    data = pd.read_csv(file_path)
    kurtosis = data['Value'].kurt()
    print(f"Kurtosis: {kurtosis}")

compute_kurtosis('data.csv')

Computes the kurtosis of a numerical column.


38. Data Imputation

import pandas as pd
from sklearn.impute import SimpleImputer

def impute_data(file_path):
    data = pd.read_csv(file_path)
    imputer = SimpleImputer(strategy='mean')
    data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
    print(data_imputed.head())

impute_data('data.csv')

Imputes missing values in the dataset using the mean of the column.


39. Visualize Correlation Heatmap

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def correlation_heatmap(file_path):
    data = pd.read_csv(file_path)
    correlation = data.corr(numeric_only=True)  # ignore non-numeric columns
    sns.heatmap(correlation, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

correlation_heatmap('data.csv')

Visualizes the correlation matrix as a heatmap.


40. Feature Selection

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

def feature_selection(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    selector = SelectKBest(score_func=f_classif, k=3)
    selector.fit(X, y)
    print(f"Selected Features: {X.columns[selector.get_support()]}")

feature_selection('data.csv')

Selects the top features based on statistical tests.


41. Calculate Variance

import pandas as pd

def calculate_variance(file_path):
    data = pd.read_csv(file_path)
    variance = data['Value'].var()
    print(f"Variance: {variance}")

calculate_variance('data.csv')

Calculates the variance of a numerical column.


42. Calculate Standard Deviation

import pandas as pd

def calculate_std_dev(file_path):
    data = pd.read_csv(file_path)
    std_dev = data['Value'].std()
    print(f"Standard Deviation: {std_dev}")

calculate_std_dev('data.csv')

Calculates the standard deviation of a numerical column.


43. Data Frame Shape

import pandas as pd

def data_frame_shape(file_path):
    data = pd.read_csv(file_path)
    print(f"DataFrame Shape: {data.shape}")

data_frame_shape('data.csv')

Prints the shape (number of rows and columns) of the DataFrame.


44. Value Counts with Percentages

import pandas as pd

def value_counts_percentage(file_path, column_name):
    data = pd.read_csv(file_path)
    counts = data[column_name].value_counts(normalize=True) * 100
    print(counts)

value_counts_percentage('data.csv', 'Category')

Calculates and prints the percentage of occurrences of each unique value in a column.


45. Aggregate Data by Multiple Columns

import pandas as pd

def aggregate_by_multiple_columns(file_path, columns):
    data = pd.read_csv(file_path)
    aggregated_data = data.groupby(columns).sum()
    print(aggregated_data)

aggregate_by_multiple_columns('data.csv', ['Category', 'SubCategory'])

Aggregates data by multiple columns.


46. Perform Principal Component Analysis (PCA)

import pandas as pd
from sklearn.decomposition import PCA

def perform_pca(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)
    print(principal_components)

perform_pca('data.csv')

Reduces the dimensionality of the data using PCA.

47. Perform K-Means Clustering

import pandas as pd
from sklearn.cluster import KMeans

def perform_kmeans(file_path, n_clusters):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    kmeans = KMeans(n_clusters=n_clusters)
    clusters = kmeans.fit_predict(X)
    print(clusters)

perform_kmeans('data.csv', 3)

Clusters the data into a specified number of clusters using K-Means.


48. Calculate Covariance Matrix

import pandas as pd

def calculate_covariance(file_path):
    data = pd.read_csv(file_path)
    covariance_matrix = data.cov(numeric_only=True)  # ignore non-numeric columns
    print(covariance_matrix)

calculate_covariance('data.csv')

Calculates the covariance matrix for numerical columns.


49. Normalize Data with Z-score

import pandas as pd
from scipy.stats import zscore

def zscore_normalize(file_path):
    data = pd.read_csv(file_path)
    data['Value_ZScore'] = zscore(data['Value'])
    print(data.head())

zscore_normalize('data.csv')

Normalizes data using Z-score.

50. Perform Linear Regression

import pandas as pd
from sklearn.linear_model import LinearRegression

def perform_linear_regression(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = LinearRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)

perform_linear_regression('data.csv')

Performs linear regression on the dataset to predict the target variable based on one feature.


51. Perform Logistic Regression

import pandas as pd
from sklearn.linear_model import LogisticRegression

def perform_logistic_regression(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = LogisticRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)

perform_logistic_regression('data.csv')

Performs logistic regression for binary classification on the dataset.


52. Perform Decision Tree Classification

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

def decision_tree_classification(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = DecisionTreeClassifier()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)

decision_tree_classification('data.csv')

Classifies data using a Decision Tree classifier.


53. Perform Random Forest Classification

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def random_forest_classification(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = RandomForestClassifier()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)

random_forest_classification('data.csv')

Classifies data using a Random Forest classifier.


54. Perform K-Nearest Neighbors Classification

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

def knn_classification(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = KNeighborsClassifier()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)

knn_classification('data.csv')

Classifies data using the K-Nearest Neighbors algorithm.


55. Perform Support Vector Machine Classification

import pandas as pd
from sklearn.svm import SVC

def svm_classification(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = SVC()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)

svm_classification('data.csv')

Classifies data using a Support Vector Machine.


56. Feature Importance with Random Forest

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def feature_importance(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = RandomForestClassifier()
    model.fit(X, y)
    importances = model.feature_importances_
    print(importances)

feature_importance('data.csv')

Determines feature importance using a Random Forest classifier.


57. Principal Component Analysis (PCA) for Dimensionality Reduction

import pandas as pd
from sklearn.decomposition import PCA

def pca_dimensionality_reduction(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)
    print(principal_components)

pca_dimensionality_reduction('data.csv')

Reduces dimensionality of the dataset using PCA.


58. Time Series Decomposition

import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

def time_series_decomposition(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    decomposition = sm.tsa.seasonal_decompose(data['Value'], model='additive')
    decomposition.plot()
    plt.show()

time_series_decomposition('time_series_data.csv')

Decomposes time series data into trend, seasonality, and residuals.


59. Cross-Validation

import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

def cross_validation(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = LogisticRegression()
    scores = cross_val_score(model, X, y, cv=5)
    print(f"Cross-Validation Scores: {scores}")

cross_validation('data.csv')

Performs cross-validation to evaluate the performance of a logistic regression model.


60. Hyperparameter Tuning with Grid Search

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

def grid_search(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = RandomForestClassifier()
    param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X, y)
    print(f"Best Parameters: {grid_search.best_params_}")

grid_search('data.csv')

Tunes hyperparameters of a Random Forest model using Grid Search.
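
After the search finishes, GridSearchCV also keeps the best cross-validated score and a model refit on the full data with the best parameters; a short follow-up (continuing inside the function above, after grid_search.fit(X, y)) might look like this:

    # Best cross-validated score and the refit model using the best parameters
    print(f"Best CV Score: {grid_search.best_score_}")
    best_model = grid_search.best_estimator_
    print(best_model.predict(X))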


61. Model Evaluation with Confusion Matrix

import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

def confusion_matrix_evaluation(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = LogisticRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    matrix = confusion_matrix(y, predictions)
    print(matrix)

confusion_matrix_evaluation('data.csv')

Evaluates model performance using a confusion matrix.


62. ROC Curve and AUC

import pandas as pd
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

def roc_curve_auc(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = LogisticRegression()
    model.fit(X, y)
    y_prob = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
    print(f"AUC: {roc_auc}")

roc_curve_auc('data.csv')

Plots ROC curve and calculates AUC for binary classification.


63. Train-Test Split

import pandas as pd
from sklearn.model_selection import train_test_split

def train_test_split_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    print(f"Train Shape: {X_train.shape}, Test Shape: {X_test.shape}")

train_test_split_data('data.csv')

Splits the dataset into training and testing sets.
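
By default the split is random and differs on every run. For reproducible results (and, in classification tasks, class proportions preserved across the split), train_test_split accepts the optional random_state and stratify arguments; a small variation of the call above:

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )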


64. Data Augmentation

import pandas as pd
from sklearn.utils import resample

def data_augmentation(file_path):
    data = pd.read_csv(file_path)
    data_majority = data[data['Target'] == 0]
    data_minority = data[data['Target'] == 1]
    data_minority_upsampled = resample(data_minority, replace=True, n_samples=len(data_majority))
    data_upsampled = pd.concat([data_majority, data_minority_upsampled])
    print(data_upsampled['Target'].value_counts())

data_augmentation('data.csv')

Augments data by upsampling the minority class to balance the dataset.


65. Outlier Detection with Isolation Forest

import pandas as pd
from sklearn.ensemble import IsolationForest

def outlier_detection(file_path):
    data = pd.read_csv(file_path)
    X = data[['Value']]
    model = IsolationForest(contamination=0.1)
    outliers = model.fit_predict(X)
    data['Outlier'] = outliers
    print(data[data['Outlier'] == -1])

outlier_detection('data.csv')

Detects outliers using the Isolation Forest algorithm.


66. One-Hot Encoding

import pandas as pd

def one_hot_encoding(file_path):
    data = pd.read_csv(file_path)
    encoded_data = pd.get_dummies(data, columns=['Category'])
    print(encoded_data.head())

one_hot_encoding('data.csv')

Applies one-hot encoding to categorical columns.


67. Feature Scaling with Min-Max Normalization

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def min_max_scaling(file_path):
    data = pd.read_csv(file_path)
    scaler = MinMaxScaler()
    data[['Value']] = scaler.fit_transform(data[['Value']])
    print(data.head())

min_max_scaling('data.csv')

Scales numerical features to a range between 0 and 1.


68. Compute Feature Correlation

import pandas as pd

def feature_correlation(file_path):
    data = pd.read_csv(file_path)
    correlation = data.corr(numeric_only=True)  # ignore non-numeric columns
    print(correlation)

feature_correlation('data.csv')

Computes correlation between features in the dataset.


69. Perform Hierarchical Clustering

import pandas as pd
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

def hierarchical_clustering(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1', 'Feature2']]
    dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))
    plt.show()

hierarchical_clustering('data.csv')

Performs hierarchical clustering and plots the dendrogram.


70. Principal Component Analysis (PCA) Visualization

import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def pca_visualization(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)
    plt.scatter(principal_components[:, 0], principal_components[:, 1])
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('PCA Visualization')
    plt.show()

pca_visualization('data.csv')

Visualizes PCA results in a 2D scatter plot.


71. Time Series Forecasting with ARIMA

import pandas as pd
import statsmodels.api as sm

def arima_forecasting(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    model = sm.tsa.ARIMA(data['Value'], order=(5, 1, 0))
    results = model.fit()
    forecast = results.forecast(steps=10)
    print(forecast)

arima_forecasting('time_series_data.csv')

Forecasts future values in a time series using ARIMA.


72. Data Transformation with Logarithm

import pandas as pd
import numpy as np

def log_transform(file_path):
data = pd.read_csv(file_path)
data['Value_Log'] = np.log1p(data['Value'])
print(data.head())

log_transform('data.csv')

Applies a logarithmic transformation to the numerical column.


73. Plotting Cumulative Sum

import pandas as pd
import matplotlib.pyplot as plt

def plot_cumulative_sum(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    cumulative_sum = data['Value'].cumsum()
    cumulative_sum.plot()
    plt.title('Cumulative Sum')
    plt.xlabel('Date')
    plt.ylabel('Cumulative Sum')
    plt.show()

plot_cumulative_sum('time_series_data.csv')

Plots the cumulative sum of a time series.


74. Seasonal Decomposition of Time Series

import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

def seasonal_decomposition(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    decomposition = sm.tsa.seasonal_decompose(data['Value'], model='multiplicative')
    decomposition.plot()
    plt.show()

seasonal_decomposition('time_series_data.csv')

Decomposes time series data into seasonal, trend, and residual components.


75. Rolling Statistics

import pandas as pd

def rolling_statistics(file_path, window_size):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    rolling_mean = data['Value'].rolling(window=window_size).mean()
    rolling_std = data['Value'].rolling(window=window_size).std()
    print("Rolling Mean:\n", rolling_mean)
    print("Rolling Std Dev:\n", rolling_std)

rolling_statistics('time_series_data.csv', 7)

Calculates rolling mean and standard deviation for a specified window size.


76. Time Series Autocorrelation

import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

def autocorrelation(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    sm.graphics.tsa.plot_acf(data['Value'])
    plt.show()

autocorrelation('time_series_data.csv')

Plots the autocorrelation function (ACF) of a time series.


77. Perform Gradient Boosting

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier

def gradient_boosting(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = GradientBoostingClassifier()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)

gradient_boosting('data.csv')

Applies Gradient Boosting classification to the dataset.


78. Evaluate Model with Classification Report

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

def classification_report_evaluation(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = LogisticRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    report = classification_report(y, predictions)
    print(report)

classification_report_evaluation('data.csv')

Generates a classification report to evaluate model performance.


79. Feature Scaling with Standardization

import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardization(file_path):
    data = pd.read_csv(file_path)
    scaler = StandardScaler()
    data[['Value']] = scaler.fit_transform(data[['Value']])
    print(data.head())

standardization('data.csv')

Standardizes numerical features to have zero mean and unit variance.


80. Data Transformation with Box-Cox

import pandas as pd
from scipy.stats import boxcox

def boxcox_transformation(file_path):
    data = pd.read_csv(file_path)
    data['Value_BoxCox'], _ = boxcox(data['Value'] + 1)  # add 1 to avoid zero values (Box-Cox requires positive data)
    print(data.head())

boxcox_transformation('data.csv')

Applies a Box-Cox transformation to stabilize variance and make the data more normally distributed.


81. Dimensionality Reduction with t-SNE

import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def tsne_dimensionality_reduction(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    tsne = TSNE(n_components=2)
    X_tsne = tsne.fit_transform(X)
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.title('t-SNE Visualization')
    plt.show()

tsne_dimensionality_reduction('data.csv')

Reduces dimensionality and visualizes the data using t-SNE.


82. Plotting Histogram of a Feature

import pandas as pd
import matplotlib.pyplot as plt

def plot_histogram(file_path, column_name):
    data = pd.read_csv(file_path)
    plt.hist(data[column_name], bins=30)
    plt.title(f'Histogram of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.show()

plot_histogram('data.csv', 'Value')

Plots a histogram of a numerical feature to visualize its distribution.


83. Calculate Relative Change

import pandas as pd

def calculate_relative_change(file_path):
    data = pd.read_csv(file_path)
    data['Relative_Change'] = data['Value'].pct_change()
    print(data.head())

calculate_relative_change('data.csv')

Calculates the relative change (percentage change) of a numerical column.


84. Convert Categorical Data to Numerical

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def categorical_to_numerical(file_path):
    data = pd.read_csv(file_path)
    label_encoder = LabelEncoder()
    data['Category_Encoded'] = label_encoder.fit_transform(data['Category'])
    print(data.head())

categorical_to_numerical('data.csv')

Converts categorical data to numerical using Label Encoding.


85. Handle Missing Values with Interpolation

import pandas as pd

def handle_missing_values(file_path):
    data = pd.read_csv(file_path)
    data_interpolated = data.interpolate()
    print(data_interpolated.head())

handle_missing_values('data.csv')

Fills missing values using interpolation.

86. Create Bins for Continuous Data

import pandas as pd

def create_bins(file_path):
    data = pd.read_csv(file_path)
    data['Value_Binned'] = pd.cut(data['Value'], bins=5)
    print(data.head())

create_bins('data.csv')

Bins continuous data into discrete intervals.


87. Time Series Resampling

import pandas as pd

def time_series_resampling(file_path, frequency):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    resampled_data = data.resample(frequency).mean()
    print(resampled_data.head())

time_series_resampling('time_series_data.csv', 'M')

Resamples time series data to a specified frequency (e.g., monthly).


88. Plot Boxplot

import pandas as pd
import matplotlib.pyplot as plt

def plot_boxplot(file_path, column_name):
    data = pd.read_csv(file_path)
    plt.boxplot(data[column_name])
    plt.title(f'Boxplot of {column_name}')
    plt.ylabel(column_name)
    plt.show()

plot_boxplot('data.csv', 'Value')

Plots a boxplot to visualize the distribution of a feature.


89. Calculate Rolling Mean and Standard Deviation

import pandas as pd

def rolling_mean_std(file_path, window_size):
    data = pd.read_csv(file_path)
    rolling_mean = data['Value'].rolling(window=window_size).mean()
    rolling_std = data['Value'].rolling(window=window_size).std()
    print(f"Rolling Mean:\n{rolling_mean}\n")
    print(f"Rolling Std Dev:\n{rolling_std}\n")

rolling_mean_std('data.csv', 10)

Calculates and prints the rolling mean and standard deviation of a feature.


90. Chi-Square Test for Independence

import pandas as pd
from scipy.stats import chi2_contingency

def chi_square_test(file_path):
    data = pd.read_csv(file_path)
    contingency_table = pd.crosstab(data['Feature1'], data['Feature2'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi2 Statistic: {chi2}\nP-Value: {p}")

chi_square_test('data.csv')

Performs a Chi-Square test to determine if two categorical variables are independent.


91. Create and Evaluate Decision Tree

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def decision_tree_evaluation(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy}")

decision_tree_evaluation('data.csv')

Builds and evaluates a Decision Tree classifier.


92. Evaluate Model with Precision, Recall, F1-Score

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

def precision_recall_f1(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = LogisticRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    report = classification_report(y, predictions)
    print(report)

precision_recall_f1('data.csv')

Evaluates model performance using precision, recall, and F1-score.


93. Generate Synthetic Data

import pandas as pd
from sklearn.datasets import make_classification

def generate_synthetic_data():
    X, y = make_classification(n_samples=1000, n_features=10, n_classes=2)
    data = pd.DataFrame(X, columns=[f'Feature{i+1}' for i in range(X.shape[1])])
    data['Target'] = y
    data.to_csv('synthetic_data.csv', index=False)
    print("Synthetic data generated and saved to 'synthetic_data.csv'.")

generate_synthetic_data()

Generates synthetic classification data and saves it to a CSV file.


94. Analyze Data Distribution

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def analyze_data_distribution(file_path):
    data = pd.read_csv(file_path)
    sns.histplot(data['Value'], kde=True)
    plt.title('Distribution of Value')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.show()

analyze_data_distribution('data.csv')

Analyzes and visualizes the distribution of a numerical feature using a histogram with KDE.


95. Plot Heatmap of Correlation Matrix

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_heatmap(file_path):
    data = pd.read_csv(file_path)
    correlation_matrix = data.corr(numeric_only=True)  # ignore non-numeric columns
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

plot_heatmap('data.csv')

Plots a heatmap of the correlation matrix to visualize relationships between features.


96. Feature Engineering: Creating Interaction Features

import pandas as pd

def create_interaction_features(file_path):
    data = pd.read_csv(file_path)
    data['Feature_Interaction'] = data['Feature1'] * data['Feature2']
    print(data.head())

create_interaction_features('data.csv')

Creates new features by interacting (multiplying) existing features.


97. Plot Time Series with Multiple Series

import pandas as pd
import matplotlib.pyplot as plt

def plot_time_series_multiple(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    data.plot()
    plt.title('Multiple Time Series')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.show()

plot_time_series_multiple('time_series_data.csv')

Plots multiple time series on the same graph.


98. Calculate Mean Absolute Error (MAE)

import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

def calculate_mae(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    print(f"Mean Absolute Error: {mae}")

calculate_mae('data.csv')

Calculates the Mean Absolute Error of a regression model.


99. Compute and Plot Confusion Matrix

import pandas as pd
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def plot_confusion_matrix(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)  # predicts on the test set and draws the matrix
    plt.title('Confusion Matrix')
    plt.show()

plot_confusion_matrix('data.csv')

Computes and plots a confusion matrix for model evaluation.


100. Calculate Feature Pairwise Relationships

import pandas as pd

def pairwise_relationships(file_path):
    data = pd.read_csv(file_path)
    pairwise_corr = data.corr(numeric_only=True)  # ignore non-numeric columns
    print("Pairwise Correlations:\n", pairwise_corr)

pairwise_relationships('data.csv')

Calculates and displays pairwise relationships between all features in the dataset.

This collection of 100 Python scripts for beginners is designed to enhance your understanding of data analysis with practical examples.
