Python Scripts For Beginners: Data Analysis
A collection of 100 Python scripts for beginners related to data analysis, each designed to help with a fundamental task and provide a useful example.
1. Load and Display CSV Data
import pandas as pd
def load_csv(file_path):
    data = pd.read_csv(file_path)
    print(data.head())
load_csv('data.csv')
Loads a CSV file into a DataFrame and displays the first few rows.
2. Summary Statistics
import pandas as pd
def summary_statistics(file_path):
    data = pd.read_csv(file_path)
    print(data.describe())
summary_statistics('data.csv')
Generates summary statistics for numerical columns in the dataset.
3. Data Cleaning (Removing NaN Values)
import pandas as pd
def clean_data(file_path):
    data = pd.read_csv(file_path)
    cleaned_data = data.dropna()
    print(cleaned_data.head())
clean_data('data.csv')
Removes rows with NaN values from the dataset.
4. Data Filtering
import pandas as pd
def filter_data(file_path, column_name, value):
    data = pd.read_csv(file_path)
    filtered_data = data[data[column_name] == value]
    print(filtered_data.head())
filter_data('data.csv', 'Category', 'A')
Filters rows where a specified column matches a given value.
5. Data Aggregation
import pandas as pd
def aggregate_data(file_path, column_name):
    data = pd.read_csv(file_path)
    aggregated = data.groupby(column_name).sum(numeric_only=True)
    print(aggregated)
aggregate_data('data.csv', 'Category')
Aggregates data by summing numerical columns, grouped by a specified column.
6. Data Merging
import pandas as pd
def merge_data(file1, file2, key):
    data1 = pd.read_csv(file1)
    data2 = pd.read_csv(file2)
    merged_data = pd.merge(data1, data2, on=key)
    print(merged_data.head())
merge_data('data1.csv', 'data2.csv', 'ID')
Merges two datasets on a common key.
7. Data Pivoting
import pandas as pd
def pivot_data(file_path, index, columns, values):
    data = pd.read_csv(file_path)
    pivoted_data = data.pivot_table(index=index, columns=columns, values=values)
    print(pivoted_data.head())
pivot_data('data.csv', 'Date', 'Category', 'Value')
Pivots the DataFrame to create a new table with specified index, columns, and values.
8. Time Series Analysis
import pandas as pd
def time_series_analysis(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    print(data.resample('M').sum())
time_series_analysis('time_series_data.csv')
Performs monthly aggregation on a time series dataset.
9. Data Visualization (Histogram)
import pandas as pd
import matplotlib.pyplot as plt
def plot_histogram(file_path, column_name):
    data = pd.read_csv(file_path)
    data[column_name].hist()
    plt.title('Histogram')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.show()
plot_histogram('data.csv', 'Value')
Plots a histogram of a specified column.
10. Data Visualization (Scatter Plot)
import pandas as pd
import matplotlib.pyplot as plt
def plot_scatter(file_path, x_col, y_col):
    data = pd.read_csv(file_path)
    plt.scatter(data[x_col], data[y_col])
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title('Scatter Plot')
    plt.show()
plot_scatter('data.csv', 'Value1', 'Value2')
Plots a scatter plot of two columns.
11. Data Visualization (Line Plot)
import pandas as pd
import matplotlib.pyplot as plt
def plot_line(file_path, x_col, y_col):
    data = pd.read_csv(file_path)
    plt.plot(data[x_col], data[y_col])
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title('Line Plot')
    plt.show()
plot_line('data.csv', 'Date', 'Value')
Plots a line graph of two columns.
12. Correlation Matrix
import pandas as pd
def correlation_matrix(file_path):
    data = pd.read_csv(file_path)
    # numeric_only avoids errors when the dataset contains non-numeric columns
    correlation = data.corr(numeric_only=True)
    print(correlation)
correlation_matrix('data.csv')
Computes and prints the correlation matrix for numerical columns.
13. Handling Missing Data (Fill NaN)
import pandas as pd
def fill_missing_data(file_path, value):
    data = pd.read_csv(file_path)
    filled_data = data.fillna(value)
    print(filled_data.head())
fill_missing_data('data.csv', 0)
Fills missing values in the dataset with a specified value.
14. Handling Missing Data (Interpolate)
import pandas as pd
def interpolate_missing_data(file_path):
    data = pd.read_csv(file_path)
    interpolated_data = data.interpolate()
    print(interpolated_data.head())
interpolate_missing_data('data.csv')
Interpolates missing values in the dataset.
15. Detect Outliers
import pandas as pd
def detect_outliers(file_path, column_name):
    data = pd.read_csv(file_path)
    q1 = data[column_name].quantile(0.25)
    q3 = data[column_name].quantile(0.75)
    iqr = q3 - q1
    outliers = data[(data[column_name] < (q1 - 1.5 * iqr)) | (data[column_name] > (q3 + 1.5 * iqr))]
    print(outliers)
detect_outliers('data.csv', 'Value')
Detects outliers in a specified column using the IQR method.
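If you need the fences themselves (for capping or winsorizing values rather than just inspecting outliers), the rule factors neatly into a helper. A minimal sketch reusing the same hypothetical 'data.csv' and 'Value' column; k=1.5 is the conventional multiplier, and k=3 is sometimes used to flag only extreme outliers:
import pandas as pd
def iqr_bounds(series, k=1.5):
    # Lower and upper fences of the IQR rule: [Q1 - k*IQR, Q3 + k*IQR]
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr
data = pd.read_csv('data.csv')
low, high = iqr_bounds(data['Value'])
print(data[(data['Value'] < low) | (data['Value'] > high)])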
16. Apply Function to DataFrame
import pandas as pd
def apply_function(file_path):
    data = pd.read_csv(file_path)
    data['NewColumn'] = data['Value'].apply(lambda x: x * 2)
    print(data.head())
apply_function('data.csv')
Applies a function to a column in the DataFrame.
17. Normalize Data
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
def normalize_data(file_path):
    data = pd.read_csv(file_path)
    scaler = MinMaxScaler()
    data[['Value']] = scaler.fit_transform(data[['Value']])
    print(data.head())
normalize_data('data.csv')
Normalizes a column of numerical data to a range between 0 and 1.
18. Standardize Data
import pandas as pd
from sklearn.preprocessing import StandardScaler
def standardize_data(file_path):
    data = pd.read_csv(file_path)
    scaler = StandardScaler()
    data[['Value']] = scaler.fit_transform(data[['Value']])
    print(data.head())
standardize_data('data.csv')
Standardizes a column of numerical data to have a mean of 0 and a standard deviation of 1.
19. Detect Duplicate Rows
import pandas as pd
def detect_duplicates(file_path):
    data = pd.read_csv(file_path)
    duplicates = data[data.duplicated()]
    print(duplicates)
detect_duplicates('data.csv')
Detects and prints duplicate rows in the dataset.
20. Drop Duplicate Rows
import pandas as pd
def drop_duplicates(file_path):
    data = pd.read_csv(file_path)
    data_cleaned = data.drop_duplicates()
    print(data_cleaned.head())
drop_duplicates('data.csv')
Removes duplicate rows from the dataset.
21. Extract Unique Values
import pandas as pd
def unique_values(file_path, column_name):
    data = pd.read_csv(file_path)
    unique_vals = data[column_name].unique()
    print(unique_vals)
unique_values('data.csv', 'Category')
Extracts and prints unique values from a specified column.
22. Count Value Occurrences
import pandas as pd
def count_value_occurrences(file_path, column_name):
    data = pd.read_csv(file_path)
    counts = data[column_name].value_counts()
    print(counts)
count_value_occurrences('data.csv', 'Category')
Counts and prints occurrences of each unique value in a specified column.
23. Save DataFrame to CSV
import pandas as pd
def save_to_csv(data, file_path):
    data.to_csv(file_path, index=False)
    print(f"Data saved to {file_path}")
data = pd.read_csv('data.csv')
save_to_csv(data, 'saved_data.csv')
Saves a DataFrame to a CSV file.
24. Handle Categorical Data
import pandas as pd
from sklearn.preprocessing import LabelEncoder
def handle_categorical_data(file_path):
    data = pd.read_csv(file_path)
    encoder = LabelEncoder()
    data['CategoryEncoded'] = encoder.fit_transform(data['Category'])
    print(data.head())
handle_categorical_data('data.csv')
Encodes categorical data into numerical values using LabelEncoder.
25. Apply Rolling Window Analysis
import pandas as pd
def rolling_window_analysis(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    rolling_mean = data['Value'].rolling(window=7).mean()
    print(rolling_mean)
rolling_window_analysis('time_series_data.csv')
Calculates the rolling mean for time series data.
26. Detect Seasonal Trends
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
def detect_seasonal_trends(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    decomposition = sm.tsa.seasonal_decompose(data['Value'], model='additive')
    decomposition.plot()
    plt.show()
detect_seasonal_trends('time_series_data.csv')
Detects and plots seasonal trends in time series data.
27. Basic Descriptive Statistics
import pandas as pd
def descriptive_statistics(file_path):
    data = pd.read_csv(file_path)
    desc_stats = data.describe(include='all')
    print(desc_stats)
descriptive_statistics('data.csv')
Provides basic descriptive statistics for all columns in the dataset.
28. Feature Engineering
import pandas as pd
def feature_engineering(file_path):
    data = pd.read_csv(file_path)
    data['Value_Squared'] = data['Value'] ** 2
    print(data.head())
feature_engineering('data.csv')
Creates a new feature by squaring an existing column.
29. Data Aggregation by Date
import pandas as pd
def aggregate_by_date(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'])
    aggregated_data = data.groupby(data['Date'].dt.date).sum(numeric_only=True)
    print(aggregated_data)
aggregate_by_date('time_series_data.csv')
Aggregates data by date, summing numerical values.
30. Visualize Data Distribution
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def visualize_distribution(file_path, column_name):
    data = pd.read_csv(file_path)
    sns.histplot(data[column_name], kde=True)
    plt.title('Data Distribution')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.show()
visualize_distribution('data.csv', 'Value')
Visualizes the distribution of a numerical column using a histogram and KDE.
31. Cross-tabulation
import pandas as pd
def cross_tabulation(file_path, row_col, col_col):
    data = pd.read_csv(file_path)
    crosstab = pd.crosstab(data[row_col], data[col_col])
    print(crosstab)
cross_tabulation('data.csv', 'Category', 'SubCategory')
Generates a cross-tabulation of two categorical columns.
32. Calculate Moving Average
import pandas as pd
def moving_average(file_path, window_size):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    moving_avg = data['Value'].rolling(window=window_size).mean()
    print(moving_avg)
moving_average('time_series_data.csv', 7)
Calculates the moving average for a specified window size.
33. Data Resampling
import pandas as pd
def resample_data(file_path, frequency):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    resampled_data = data.resample(frequency).mean()
    print(resampled_data)
resample_data('time_series_data.csv', 'M')
Resamples time series data to a specified frequency (e.g., monthly).
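The frequency string controls the bucket size. A short sketch of common aliases, assuming the same 'time_series_data.csv' layout; note that pandas 2.2+ spells the month-end and quarter-end aliases 'ME' and 'QE', while older versions use 'M' and 'Q':
import pandas as pd
data = pd.read_csv('time_series_data.csv', parse_dates=['Date'], index_col='Date')
print(data.resample('D').mean())  # daily averages
print(data.resample('W').mean())  # weekly averages
print(data.resample('M').mean())  # monthly averages ('ME' in pandas 2.2+)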
34. Detect Anomalies
import pandas as pd
def detect_anomalies(file_path):
    data = pd.read_csv(file_path)
    mean = data['Value'].mean()
    std_dev = data['Value'].std()
    anomalies = data[(data['Value'] > mean + 2 * std_dev) | (data['Value'] < mean - 2 * std_dev)]
    print(anomalies)
detect_anomalies('data.csv')
Flags values more than two standard deviations from the mean as anomalies.
35. Encode Dates
import pandas as pd
def encode_dates(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'])
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day
    print(data.head())
encode_dates('time_series_data.csv')
Extracts and encodes year, month, and day from date columns.
36. Compute Skewness
import pandas as pd
def compute_skewness(file_path):
    data = pd.read_csv(file_path)
    skewness = data['Value'].skew()
    print(f"Skewness: {skewness}")
compute_skewness('data.csv')
Computes the skewness of a numerical column.
37. Compute Kurtosis
import pandas as pd
def compute_kurtosis(file_path):
    data = pd.read_csv(file_path)
    kurtosis = data['Value'].kurt()
    print(f"Kurtosis: {kurtosis}")
compute_kurtosis('data.csv')
Computes the kurtosis of a numerical column.
38. Data Imputation
import pandas as pd
from sklearn.impute import SimpleImputer
def impute_data(file_path):
    data = pd.read_csv(file_path)
    imputer = SimpleImputer(strategy='mean')
    data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
    print(data_imputed.head())
impute_data('data.csv')
Imputes missing values in the dataset using the mean of the column.
39. Visualize Correlation Heatmap
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def correlation_heatmap(file_path):
    data = pd.read_csv(file_path)
    correlation = data.corr(numeric_only=True)
    sns.heatmap(correlation, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()
correlation_heatmap('data.csv')
Visualizes the correlation matrix as a heatmap.
40. Feature Selection
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
def feature_selection(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    selector = SelectKBest(score_func=f_classif, k=3)
    selector.fit(X, y)
    print(f"Selected Features: {X.columns[selector.get_support()]}")
feature_selection('data.csv')
Selects the top features based on statistical tests.
41. Calculate Variance
import pandas as pd
def calculate_variance(file_path):
    data = pd.read_csv(file_path)
    variance = data['Value'].var()
    print(f"Variance: {variance}")
calculate_variance('data.csv')
Calculates the variance of a numerical column.
42. Calculate Standard Deviation
import pandas as pd
def calculate_std_dev(file_path):
    data = pd.read_csv(file_path)
    std_dev = data['Value'].std()
    print(f"Standard Deviation: {std_dev}")
calculate_std_dev('data.csv')
Calculates the standard deviation of a numerical column.
43. Data Frame Shape
import pandas as pd
def data_frame_shape(file_path):
    data = pd.read_csv(file_path)
    print(f"DataFrame Shape: {data.shape}")
data_frame_shape('data.csv')
Prints the shape (number of rows and columns) of the DataFrame.
44. Value Counts with Percentages
import pandas as pd
def value_counts_percentage(file_path, column_name):
    data = pd.read_csv(file_path)
    counts = data[column_name].value_counts(normalize=True) * 100
    print(counts)
value_counts_percentage('data.csv', 'Category')
Calculates and prints the percentage of occurrences of each unique value in a column.
45. Aggregate Data by Multiple Columns
import pandas as pd
def aggregate_by_multiple_columns(file_path, columns):
    data = pd.read_csv(file_path)
    aggregated_data = data.groupby(columns).sum(numeric_only=True)
    print(aggregated_data)
aggregate_by_multiple_columns('data.csv', ['Category', 'SubCategory'])
Aggregates data by multiple columns.
46. Perform Principal Component Analysis (PCA)
import pandas as pd
from sklearn.decomposition import PCA
def perform_pca(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)
    print(principal_components)
perform_pca('data.csv')
Reduces the dimensionality of the data using PCA.
47. Perform K-Means Clustering
import pandas as pd
from sklearn.cluster import KMeans
def perform_kmeans(file_path, n_clusters):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    kmeans = KMeans(n_clusters=n_clusters)
    clusters = kmeans.fit_predict(X)
    print(clusters)
perform_kmeans('data.csv', 3)
Clusters the data into a specified number of clusters using K-Means.
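K-Means needs the number of clusters up front. One common way to pick it is the elbow method: plot the inertia (within-cluster sum of squares) for a range of k values and look for the bend where the curve flattens. A sketch under the same assumptions as above (a 'data.csv' with a 'Target' column to drop):
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
def elbow_method(file_path, max_k=10):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    inertias = []
    for k in range(1, max_k + 1):
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)  # within-cluster sum of squares
    plt.plot(range(1, max_k + 1), inertias, marker='o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Inertia')
    plt.title('Elbow Method')
    plt.show()
elbow_method('data.csv')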
48. Calculate Covariance Matrix
import pandas as pd
def calculate_covariance(file_path):
    data = pd.read_csv(file_path)
    covariance_matrix = data.cov(numeric_only=True)
    print(covariance_matrix)
calculate_covariance('data.csv')
Calculates the covariance matrix for numerical columns.
49. Normalize Data with Z-score
import pandas as pd
from scipy.stats import zscore
def zscore_normalize(file_path):
    data = pd.read_csv(file_path)
    data['Value_ZScore'] = zscore(data['Value'])
    print(data.head())
zscore_normalize('data.csv')
Normalizes data using Z-score.
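The Z-score is just (x - mean) / std, so the scipy call can be reproduced with plain pandas. A minimal equivalent, using ddof=0 to match scipy's default population standard deviation:
import pandas as pd
data = pd.read_csv('data.csv')
# z = (x - mean) / std, computed column-wise
data['Value_ZScore'] = (data['Value'] - data['Value'].mean()) / data['Value'].std(ddof=0)
print(data.head())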
50. Perform Linear Regression
import pandas as pd
from sklearn.linear_model import LinearRegression
def perform_linear_regression(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = LinearRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)
perform_linear_regression('data.csv')
Performs linear regression on the dataset to predict the target variable based on one feature.
51. Perform Logistic Regression
import pandas as pd
from sklearn.linear_model import LogisticRegression
def perform_logistic_regression(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = LogisticRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)
perform_logistic_regression('data.csv')
Performs logistic regression for binary classification on the dataset.
52. Perform Decision Tree Classification
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
def decision_tree_classification(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = DecisionTreeClassifier()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)
decision_tree_classification('data.csv')
Classifies data using a Decision Tree classifier.
53. Perform Random Forest Classification
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
def random_forest_classification(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = RandomForestClassifier()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)
random_forest_classification('data.csv')
Classifies data using a Random Forest classifier.
54. Perform K-Nearest Neighbors Classification
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
def knn_classification(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = KNeighborsClassifier()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)
knn_classification('data.csv')
Classifies data using the K-Nearest Neighbors algorithm.
55. Perform Support Vector Machine Classification
import pandas as pd
from sklearn.svm import SVC
def svm_classification(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1']]
    y = data['Target']
    model = SVC()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)
svm_classification('data.csv')
Classifies data using a Support Vector Machine.
56. Feature Importance with Random Forest
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
def feature_importance(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = RandomForestClassifier()
    model.fit(X, y)
    importances = model.feature_importances_
    print(importances)
feature_importance('data.csv')
Determines feature importance using a Random Forest classifier.
57. Principal Component Analysis (PCA) for Dimensionality Reduction
import pandas as pd
from sklearn.decomposition import PCA
def pca_dimensionality_reduction(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)
    print(principal_components)
pca_dimensionality_reduction('data.csv')
Reduces dimensionality of the dataset using PCA.
58. Time Series Decomposition
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
def time_series_decomposition(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    decomposition = sm.tsa.seasonal_decompose(data['Value'], model='additive')
    decomposition.plot()
    plt.show()
time_series_decomposition('time_series_data.csv')
Decomposes time series data into trend, seasonality, and residuals.
59. Cross-Validation
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
def cross_validation(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = LogisticRegression()
    scores = cross_val_score(model, X, y, cv=5)
    print(f"Cross-Validation Scores: {scores}")
cross_validation('data.csv')
Performs cross-validation to evaluate the performance of a logistic regression model.
60. Hyperparameter Tuning with Grid Search
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
def grid_search(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = RandomForestClassifier()
    param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
    search = GridSearchCV(model, param_grid, cv=5)
    search.fit(X, y)
    print(f"Best Parameters: {search.best_params_}")
grid_search('data.csv')
Tunes hyperparameters of a Random Forest model using Grid Search.
61. Model Evaluation with Confusion Matrix
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
def confusion_matrix_evaluation(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = LogisticRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    matrix = confusion_matrix(y, predictions)
    print(matrix)
confusion_matrix_evaluation('data.csv')
Evaluates model performance using a confusion matrix.
62. ROC Curve and AUC
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
def roc_curve_auc(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = LogisticRegression()
    model.fit(X, y)
    y_prob = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()
    print(f"AUC: {roc_auc}")
roc_curve_auc('data.csv')
Plots ROC curve and calculates AUC for binary classification.
63. Train-Test Split
import pandas as pd
from sklearn.model_selection import train_test_split
def train_test_split_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    print(f"Train Shape: {X_train.shape}, Test Shape: {X_test.shape}")
train_test_split_data('data.csv')
Splits the dataset into training and testing sets.
64. Data Augmentation
import pandas as pd
from sklearn.utils import resample
def data_augmentation(file_path):
    data = pd.read_csv(file_path)
    data_majority = data[data['Target'] == 0]
    data_minority = data[data['Target'] == 1]
    data_minority_upsampled = resample(data_minority, replace=True, n_samples=len(data_majority))
    data_upsampled = pd.concat([data_majority, data_minority_upsampled])
    print(data_upsampled['Target'].value_counts())
data_augmentation('data.csv')
Augments data by upsampling the minority class to balance the dataset.
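One caveat: upsampling before a train/test split lets duplicated minority rows leak into the test set and inflate scores. A leakage-safe variant, under the same assumed 'Target' column, splits first and resamples only the training portion:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
data = pd.read_csv('data.csv')
train, test = train_test_split(data, test_size=0.3, random_state=0)
majority = train[train['Target'] == 0]
minority = train[train['Target'] == 1]
minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=0)
train_balanced = pd.concat([majority, minority_upsampled])
print(train_balanced['Target'].value_counts())  # classes balanced in the training data only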
65. Outlier Detection with Isolation Forest
import pandas as pd
from sklearn.ensemble import IsolationForest
def outlier_detection(file_path):
    data = pd.read_csv(file_path)
    X = data[['Value']]
    model = IsolationForest(contamination=0.1)
    outliers = model.fit_predict(X)
    data['Outlier'] = outliers
    print(data[data['Outlier'] == -1])
outlier_detection('data.csv')
Detects outliers using the Isolation Forest algorithm.
66. One-Hot Encoding
import pandas as pd
def one_hot_encoding(file_path):
    data = pd.read_csv(file_path)
    encoded_data = pd.get_dummies(data, columns=['Category'])
    print(encoded_data.head())
one_hot_encoding('data.csv')
Applies one-hot encoding to categorical columns.
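For linear models, the dummy columns produced above are collinear (they always sum to 1). A common remedy is to drop one dummy per category; a sketch with the same hypothetical 'Category' column:
import pandas as pd
data = pd.read_csv('data.csv')
# drop_first=True drops one dummy per category to avoid redundant, collinear columns
encoded_data = pd.get_dummies(data, columns=['Category'], drop_first=True)
print(encoded_data.head())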
67. Feature Scaling with Min-Max Normalization
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
def min_max_scaling(file_path):
    data = pd.read_csv(file_path)
    scaler = MinMaxScaler()
    data[['Value']] = scaler.fit_transform(data[['Value']])
    print(data.head())
min_max_scaling('data.csv')
Scales numerical features to a range between 0 and 1.
68. Compute Feature Correlation
import pandas as pd
def feature_correlation(file_path):
    data = pd.read_csv(file_path)
    correlation = data.corr(numeric_only=True)
    print(correlation)
feature_correlation('data.csv')
Computes correlation between features in the dataset.
69. Perform Hierarchical Clustering
import pandas as pd
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
def hierarchical_clustering(file_path):
    data = pd.read_csv(file_path)
    X = data[['Feature1', 'Feature2']]
    sch.dendrogram(sch.linkage(X, method='ward'))
    plt.show()
hierarchical_clustering('data.csv')
Performs hierarchical clustering and plots the dendrogram.
70. Principal Component Analysis (PCA) Visualization
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
def pca_visualization(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)
    plt.scatter(principal_components[:, 0], principal_components[:, 1])
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('PCA Visualization')
    plt.show()
pca_visualization('data.csv')
Visualizes PCA results in a 2D scatter plot.
71. Time Series Forecasting with ARIMA
import pandas as pd
import statsmodels.api as sm
def arima_forecasting(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    model = sm.tsa.ARIMA(data['Value'], order=(5, 1, 0))
    results = model.fit()
    forecast = results.forecast(steps=10)
    print(forecast)
arima_forecasting('time_series_data.csv')
Forecasts future values in a time series using ARIMA.
72. Data Transformation with Logarithm
import pandas as pd
import numpy as np
def log_transform(file_path):
    data = pd.read_csv(file_path)
    data['Value_Log'] = np.log1p(data['Value'])
    print(data.head())
log_transform('data.csv')
Applies a logarithmic transformation to the numerical column.
73. Plotting Cumulative Sum
import pandas as pd
import matplotlib.pyplot as plt
def plot_cumulative_sum(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    cumulative_sum = data['Value'].cumsum()
    cumulative_sum.plot()
    plt.title('Cumulative Sum')
    plt.xlabel('Date')
    plt.ylabel('Cumulative Sum')
    plt.show()
plot_cumulative_sum('time_series_data.csv')
Plots the cumulative sum of time series data.
74. Seasonal Decomposition of Time Series
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
def seasonal_decomposition(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    decomposition = sm.tsa.seasonal_decompose(data['Value'], model='multiplicative')
    decomposition.plot()
    plt.show()
seasonal_decomposition('time_series_data.csv')
Decomposes time series data into seasonal, trend, and residual components.
75. Rolling Statistics
import pandas as pd
def rolling_statistics(file_path, window_size):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    rolling_mean = data['Value'].rolling(window=window_size).mean()
    rolling_std = data['Value'].rolling(window=window_size).std()
    print("Rolling Mean:\n", rolling_mean)
    print("Rolling Std Dev:\n", rolling_std)
rolling_statistics('time_series_data.csv', 7)
Calculates rolling mean and standard deviation for a specified window size.
76. Time Series Autocorrelation
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
def autocorrelation(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    sm.graphics.tsa.plot_acf(data['Value'])
    plt.show()
autocorrelation('time_series_data.csv')
Plots the autocorrelation function (ACF) of a time series.
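The partial autocorrelation function (PACF) is the usual companion plot, showing each lag's correlation with the influence of shorter lags removed; together the two plots help choose ARIMA orders. A sketch under the same file layout:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
data = pd.read_csv('time_series_data.csv', parse_dates=['Date'], index_col='Date')
sm.graphics.tsa.plot_pacf(data['Value'])
plt.show()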
77. Perform Gradient Boosting
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
def gradient_boosting(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = GradientBoostingClassifier()
    model.fit(X, y)
    predictions = model.predict(X)
    print(predictions)
gradient_boosting('data.csv')
Applies Gradient Boosting classification to the dataset.
78. Evaluate Model with Classification Report
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
def classification_report_evaluation(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = LogisticRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    report = classification_report(y, predictions)
    print(report)
classification_report_evaluation('data.csv')
Generates a classification report to evaluate model performance.
79. Feature Scaling with Standardization
import pandas as pd
from sklearn.preprocessing import StandardScaler
def standardization(file_path):
    data = pd.read_csv(file_path)
    scaler = StandardScaler()
    data[['Value']] = scaler.fit_transform(data[['Value']])
    print(data.head())
standardization('data.csv')
Standardizes numerical features to have zero mean and unit variance.
80. Data Transformation with Box-Cox
import pandas as pd
from scipy.stats import boxcox
def boxcox_transformation(file_path):
    data = pd.read_csv(file_path)
    # Shift by 1 so every value is strictly positive, as Box-Cox requires
    data['Value_BoxCox'], _ = boxcox(data['Value'] + 1)
    print(data.head())
boxcox_transformation('data.csv')
Applies Box-Cox transformation to stabilize variance and make data more normal.
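Box-Cox is invertible, which matters if you model the transformed values and want predictions back on the original scale. A sketch using scipy's inverse, remembering to undo the +1 shift applied above:
import pandas as pd
from scipy.stats import boxcox
from scipy.special import inv_boxcox
data = pd.read_csv('data.csv')
transformed, lmbda = boxcox(data['Value'] + 1)
recovered = inv_boxcox(transformed, lmbda) - 1  # undo the transform, then the +1 shift
print(recovered[:5])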
81. Dimensionality Reduction with t-SNE
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
def tsne_dimensionality_reduction(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    tsne = TSNE(n_components=2)
    X_tsne = tsne.fit_transform(X)
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.title('t-SNE Visualization')
    plt.show()
tsne_dimensionality_reduction('data.csv')
Reduces dimensionality and visualizes the data using t-SNE.
82. Plotting Histogram of a Feature
import pandas as pd
import matplotlib.pyplot as plt
def plot_histogram(file_path, column_name):
    data = pd.read_csv(file_path)
    plt.hist(data[column_name], bins=30)
    plt.title(f'Histogram of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.show()
plot_histogram('data.csv', 'Value')
Plots a histogram of a numerical feature to visualize its distribution.
83. Calculate Relative Change
import pandas as pd
def calculate_relative_change(file_path):
    data = pd.read_csv(file_path)
    data['Relative_Change'] = data['Value'].pct_change()
    print(data.head())
calculate_relative_change('data.csv')
Calculates the relative change (percentage change) of a numerical column.
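Under the hood, pct_change() computes (x_t - x_{t-1}) / x_{t-1}. A quick check that a manual shift-based version agrees, assuming the 'Value' column has no missing values or zeros (which the two versions may handle differently):
import pandas as pd
data = pd.read_csv('data.csv')
# Explicit formula: divide each change by the previous value
manual = (data['Value'] - data['Value'].shift(1)) / data['Value'].shift(1)
print(manual.equals(data['Value'].pct_change()))  # expected: True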
84. Convert Categorical Data to Numerical
import pandas as pd
from sklearn.preprocessing import LabelEncoder
def categorical_to_numerical(file_path):
    data = pd.read_csv(file_path)
    label_encoder = LabelEncoder()
    data['Category_Encoded'] = label_encoder.fit_transform(data['Category'])
    print(data.head())
categorical_to_numerical('data.csv')
Converts categorical data to numerical using Label Encoding.
85. Handle Missing Values with Interpolation
import pandas as pd
def handle_missing_values(file_path):
    data = pd.read_csv(file_path)
    data_interpolated = data.interpolate()
    print(data_interpolated.head())
handle_missing_values('data.csv')
Fills missing values using interpolation.
86. Create Bins for Continuous Data
import pandas as pd
def create_bins(file_path):
    data = pd.read_csv(file_path)
    data['Value_Binned'] = pd.cut(data['Value'], bins=5)
    print(data.head())
create_bins('data.csv')
Bins continuous data into discrete intervals.
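pd.cut makes equal-width bins, which can leave some bins nearly empty on skewed data. Two common variations, sketched with the same assumed 'Value' column: readable labels, and pd.qcut for equal-frequency (quantile) bins (qcut assumes enough distinct values to form unique bin edges):
import pandas as pd
data = pd.read_csv('data.csv')
# Equal-width bins with readable labels
data['Value_Binned'] = pd.cut(data['Value'], bins=3, labels=['low', 'medium', 'high'])
# Equal-frequency bins: each holds roughly the same number of rows
data['Value_Quartile'] = pd.qcut(data['Value'], q=4)
print(data.head())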
87. Time Series Resampling
import pandas as pd
def time_series_resampling(file_path, frequency):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    resampled_data = data.resample(frequency).mean()
    print(resampled_data.head())
time_series_resampling('time_series_data.csv', 'M')
Resamples time series data to a specified frequency (e.g., monthly).
88. Plot Boxplot
import pandas as pd
import matplotlib.pyplot as plt
def plot_boxplot(file_path, column_name):
    data = pd.read_csv(file_path)
    plt.boxplot(data[column_name])
    plt.title(f'Boxplot of {column_name}')
    plt.ylabel(column_name)
    plt.show()
plot_boxplot('data.csv', 'Value')
Plots a boxplot to visualize the distribution of a feature.
89. Calculate Rolling Mean and Standard Deviation
import pandas as pd
def rolling_mean_std(file_path, window_size):
    data = pd.read_csv(file_path)
    rolling_mean = data['Value'].rolling(window=window_size).mean()
    rolling_std = data['Value'].rolling(window=window_size).std()
    print(f"Rolling Mean:\n{rolling_mean}\n")
    print(f"Rolling Std Dev:\n{rolling_std}\n")
rolling_mean_std('data.csv', 10)
Calculates and prints the rolling mean and standard deviation of a feature.
90. Chi-Square Test for Independence
import pandas as pd
from scipy.stats import chi2_contingency
def chi_square_test(file_path):
    data = pd.read_csv(file_path)
    contingency_table = pd.crosstab(data['Feature1'], data['Feature2'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi2 Statistic: {chi2}\nP-Value: {p}")
chi_square_test('data.csv')
Performs a Chi-Square test to determine if two categorical variables are independent.
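To turn the p-value into a decision, compare it against a significance level, conventionally 0.05. A sketch extending the script above with the same assumed columns:
import pandas as pd
from scipy.stats import chi2_contingency
data = pd.read_csv('data.csv')
contingency_table = pd.crosstab(data['Feature1'], data['Feature2'])
chi2, p, dof, expected = chi2_contingency(contingency_table)
alpha = 0.05  # conventional significance level
if p < alpha:
    print("Reject the null hypothesis: the variables appear to be dependent.")
else:
    print("Fail to reject the null hypothesis: no evidence of dependence.")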
91. Create and Evaluate Decision Tree
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
def decision_tree_evaluation(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy}")
decision_tree_evaluation('data.csv')
Builds and evaluates a Decision Tree classifier.
92. Evaluate Model with Precision, Recall, F1-Score
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
def precision_recall_f1(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    model = LogisticRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    report = classification_report(y, predictions)
    print(report)
precision_recall_f1('data.csv')
Evaluates model performance using precision, recall, and F1-score.
93. Generate Synthetic Data
import pandas as pd
from sklearn.datasets import make_classification
def generate_synthetic_data():
    X, y = make_classification(n_samples=1000, n_features=10, n_classes=2)
    data = pd.DataFrame(X, columns=[f'Feature{i+1}' for i in range(X.shape[1])])
    data['Target'] = y
    data.to_csv('synthetic_data.csv', index=False)
    print("Synthetic data generated and saved to 'synthetic_data.csv'.")
generate_synthetic_data()
Generates synthetic classification data and saves it to a CSV file.
94. Analyze Data Distribution
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def analyze_data_distribution(file_path):
    data = pd.read_csv(file_path)
    sns.histplot(data['Value'], kde=True)
    plt.title('Distribution of Value')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.show()
analyze_data_distribution('data.csv')
Analyzes and visualizes the distribution of a numerical feature using a histogram with KDE.
95. Plot Heatmap of Correlation Matrix
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def plot_heatmap(file_path):
    data = pd.read_csv(file_path)
    correlation_matrix = data.corr(numeric_only=True)
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()
plot_heatmap('data.csv')
Plots a heatmap of the correlation matrix to visualize relationships between features.
96. Feature Engineering: Creating Interaction Features
import pandas as pd
def create_interaction_features(file_path):
    data = pd.read_csv(file_path)
    data['Feature_Interaction'] = data['Feature1'] * data['Feature2']
    print(data.head())
create_interaction_features('data.csv')
Creates new features by interacting (multiplying) existing features.
97. Plot Time Series with Multiple Series
import pandas as pd
import matplotlib.pyplot as plt
def plot_time_series_multiple(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    data.plot()
    plt.title('Multiple Time Series')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.show()
plot_time_series_multiple('time_series_data.csv')
Plots multiple time series on the same graph.
98. Calculate Mean Absolute Error (MAE)
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
def calculate_mae(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    print(f"Mean Absolute Error: {mae}")
calculate_mae('data.csv')
Calculates the Mean Absolute Error of a regression model.
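MAE weights all errors equally; Root Mean Squared Error (RMSE) penalizes large errors more heavily, so the two are often reported together. A companion sketch under the same assumed 'Target' column:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def calculate_rmse(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(f"Root Mean Squared Error: {rmse}")
calculate_rmse('data.csv')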
99. Compute and Plot Confusion Matrix
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
def plot_confusion_matrix(file_path):
    data = pd.read_csv(file_path)
    X = data.drop('Target', axis=1)
    y = data['Target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    # from_estimator predicts on X_test and draws the matrix in one call
    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
    plt.title('Confusion Matrix')
    plt.show()
plot_confusion_matrix('data.csv')
Computes and plots a confusion matrix for model evaluation.
100. Calculate Feature Pairwise Relationships
import pandas as pd
def pairwise_relationships(file_path):
    data = pd.read_csv(file_path)
    pairwise_corr = data.corr(numeric_only=True)
    print("Pairwise Correlations:\n", pairwise_corr)
pairwise_relationships('data.csv')
Calculates and displays pairwise correlations between all numerical features in the dataset.
This collection of 100 Python scripts for beginners is designed to enhance your understanding of data analysis with practical examples.