# prebid-version-popularity-c.../streamlit_app.py
# 2024-10-03 11:21:34 -04:00
#
# 161 lines
# 5.0 KiB
# Python

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import json
import re
from collections import Counter
# Load the JSON data from the uploaded file
def load_json(file):
    """Parse an uploaded file as JSON.

    Args:
        file: A readable file-like object (text or binary), such as the
            object returned by ``st.file_uploader``.

    Returns:
        The decoded JSON value, or ``None`` when the content cannot be
        parsed. In the failure case an error message is shown with
        ``st.error``.
    """
    try:
        return json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # json.load() decodes binary input itself, so bytes that are not
        # valid UTF-8/16/32 surface as UnicodeDecodeError rather than
        # JSONDecodeError; both mean "this is not usable JSON".
        st.error("The uploaded file is not a valid JSON.")
        return None
# Function to categorize versions into buckets
def categorize_version(version):
    """Map a version value to a coarse bucket label for charting.

    A leading ``'v'`` is dropped and the value is split on ``.`` and ``-``;
    only the first two fields (major, minor) are used.

    Args:
        version: Version such as ``'v8.12.0'`` or ``'9.3.0-pre'``. Non-string
            values are converted with ``str``; ``None`` is treated as
            unparseable.

    Returns:
        * majors 7, 8 and 9: a ten-wide minor range such as ``'8.10-8.19'``,
          or ``'<major>.60+'`` once the minor is 60 or higher;
        * majors 3 to 6: ``'<major>.x'``;
        * majors 0 to 2: ``'0.x-2.x'``;
        * anything else, including unparseable input: ``'Other'``.
    """
    # Values come from uploaded JSON, so they may be null or numeric rather
    # than strings; never let that crash the whole chart.
    if version is None:
        return 'Other'
    version = str(version)

    # Remove leading 'v' if present
    if version.startswith('v'):
        version = version[1:]

    # Split the version into major and minor parts
    parts = re.split(r'\.|-', version)
    try:
        major = int(parts[0])
        minor = int(parts[1]) if len(parts) > 1 else 0
    except ValueError:
        return 'Other'

    if major in (7, 8, 9):
        # Recent majors get fine-grained buckets of ten minor releases.
        if 0 <= minor < 60:
            low = minor // 10 * 10
            return f'{major}.{low}-{major}.{low + 9}'
        return f'{major}.60+'
    if 3 <= major <= 6:
        return f'{major}.x'
    if 0 <= major <= 2:
        return '0.x-2.x'
    return 'Other'
# Classify modules by type
def classify_module(module_name):
    """Return the module-type label for a module name.

    The name is matched by substring against the marker groups below, in
    order, and the first group with a hit wins.

    Args:
        module_name: Module name such as ``'appnexusBidAdapter'``.

    Returns:
        One of ``'Bid Adapter'``, ``'RTD Module'``, ``'ID System'``,
        ``'Analytics Adapter'`` or ``'Other'``. Non-string input is
        classified as ``'Other'``.
    """
    # Names come from uploaded JSON; a null or numeric entry would make the
    # substring test raise TypeError, so treat it as unclassifiable.
    if not isinstance(module_name, str):
        return 'Other'

    # Order matters: earlier groups take precedence over later ones.
    marker_groups = (
        ('Bid Adapter', ('BidAdapter',)),
        ('RTD Module', ('RtdProvider', 'rtdModule')),
        ('ID System', ('IdSystem', 'userId')),
        ('Analytics Adapter', ('Analytics', 'analyticsAdapter')),
    )
    for label, markers in marker_groups:
        if any(marker in module_name for marker in markers):
            return label
    return 'Other'
# Function to extract and classify modules
def extract_module_stats(data):
    """Count module occurrences across all entries, grouped by module type.

    Args:
        data: Iterable of dict entries. Each entry may carry a ``'modules'``
            list of module names; a missing or null value counts as no
            modules.

    Returns:
        A dict keyed by the category labels produced by ``classify_module``
        (``'Bid Adapter'``, ``'RTD Module'``, ``'ID System'``,
        ``'Analytics Adapter'``, ``'Other'``), each mapping to a ``Counter``
        of module name -> number of occurrences.
    """
    module_counter = {
        'Bid Adapter': Counter(),
        'RTD Module': Counter(),
        'ID System': Counter(),
        'Analytics Adapter': Counter(),
        'Other': Counter(),
    }
    for item in data:
        # `.get('modules', [])` only covers a missing key; `or []` also covers
        # an explicit JSON null, which would otherwise fail on iteration.
        for module in item.get('modules') or []:
            module_counter[classify_module(module)][module] += 1
    return module_counter
# Create a bar chart of the version buckets
def create_version_chart(data):
    """Render a bar chart of version buckets and the total entry count.

    Each entry's ``'version'`` value is bucketed with ``categorize_version``;
    the bucket frequencies are drawn as a bar chart via ``st.pyplot`` and the
    number of entries is written below it.

    Args:
        data: Sequence of dict entries. A missing ``'version'`` key is
            treated as an unparseable version instead of raising
            ``KeyError``.
    """
    # str() keeps null/numeric JSON values from crashing the bucketing; an
    # absent key becomes '' which categorize_version maps to 'Other'.
    version_buckets = [
        categorize_version(str(item.get('version', ''))) for item in data
    ]

    # Count occurrences of each bucket, ordered by bucket label
    version_counts = pd.Series(version_buckets).value_counts().sort_index()

    # A bar plot of an empty Series fails inside pandas, so only draw when
    # there is something to show.
    if not version_counts.empty:
        fig, ax = plt.subplots()
        try:
            # rot=45 rotates the x tick labels without going through
            # pyplot's global "current axes" state.
            version_counts.plot(kind='bar', ax=ax, rot=45)
            ax.set_xlabel('Version Buckets')
            ax.set_ylabel('Number of URLs')
            ax.set_title('Number of URLs per Version Bucket')
            st.pyplot(fig)
        finally:
            # pyplot keeps every figure it creates alive until closed;
            # without this each Streamlit rerun would leak one figure.
            plt.close(fig)

    # Display the total number of sites
    st.write(f"Total Number of Sites: {len(data)}")
# Function to display module statistics
def display_module_stats(module_stats):
    """Render one popularity table per module category.

    For every ``category -> counter`` pair a subheader is written, followed
    by a table of the counter's entries sorted by descending count.

    Args:
        module_stats: Mapping of category label to a mapping of
            module name -> count, as returned by ``extract_module_stats``.
    """
    for category in module_stats:
        st.subheader(f"{category} Popularity")
        rows = module_stats[category].items()
        popularity = (
            pd.DataFrame(rows, columns=[category, 'Count'])
            .sort_values(by='Count', ascending=False)
            .reset_index(drop=True)
        )
        st.table(popularity)
# Streamlit app
# Entries listing more modules than this are dropped before charting.
MAX_MODULES_PER_ENTRY = 300

st.title('Version Popularity Chart (Grouped by Buckets)')
uploaded_file = st.file_uploader(
    'Upload a JSON file eg https://github.com/prebid/prebid-integration-monitor/blob/main/output/10k.json',
    type='json',
)
if uploaded_file is not None:
    data = load_json(uploaded_file)
    # The functions below expect a list of dict entries; anything else
    # (None on parse failure, a bare JSON object, a scalar) has no entries.
    entries = data if isinstance(data, list) else []
    # Keep dict entries only, and drop those with too many modules. `or []`
    # treats a null 'modules' value the same as a missing one.
    filtered_data = [
        item
        for item in entries
        if isinstance(item, dict)
        and len(item.get('modules') or []) <= MAX_MODULES_PER_ENTRY
    ]
    if filtered_data:  # Proceed only if there is something left to chart
        create_version_chart(filtered_data)
        module_stats = extract_module_stats(filtered_data)
        display_module_stats(module_stats)
    else:
        st.write("No valid data found in the uploaded file.")