import pandas as pd
from typing import List

def strip_character(column_name, characters: List[str]):
    """Remove every listed substring from a string, then trim it.

    The substrings are removed one after another in the order given, so a
    later entry can match text that only became contiguous once an earlier
    entry was removed.

    Args:
        column_name: The string to clean.
        characters: Substrings to delete from ``column_name``.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    cleaned = column_name
    for unwanted in characters:
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.strip()

def refine_content(df):
    """Clean the string cells of every column of ``df`` in place.

    Each string cell is passed through ``strip_character`` to remove spaces,
    newlines, ASCII and full-width colons and the literal "其他". A cell that
    is empty after cleaning is replaced with "其他".

    Non-string cells (numbers, NaN, timestamps, ...) are returned unchanged;
    calling string methods on them would raise ``AttributeError``.

    Args:
        df: The DataFrame to clean. Its columns are overwritten.

    Returns:
        The same DataFrame object, for call chaining.
    """
    strip_character_list = [' ', '\n', ':', '：', '其他']

    def _clean(value):
        # Only strings can be cleaned; leave every other cell type alone.
        if not isinstance(value, str):
            return value
        # Clean once and reuse the result for both the test and the value.
        stripped = strip_character(value, strip_character_list)
        return stripped or "其他"

    for col in df.columns:
        df[col] = df[col].apply(_clean)
    return df

def get_acv_distribution(df, acv_name, industry_col_name):
    """Count rows per industry and ACV bucket, with a grand-total row.

    ACV values are bucketed into left-closed intervals:
    [0, 1e6) -> '<100万', [1e6, 5e6) -> '100万-500万', [5e6, inf) -> '>500万'.
    Values that fall in no bucket (negative or missing) receive no interval
    and are therefore not counted.

    Side effect: a column named 'ACV Interval' is added to (or overwritten
    on) ``df``.

    Args:
        df: Source DataFrame.
        acv_name: Name of the numeric column holding the ACV amount.
        industry_col_name: Name of the column to group rows by.

    Returns:
        A DataFrame with one row per value of ``industry_col_name`` plus a
        final 'Total' row, and one column per ACV bucket holding row counts.
    """
    # Define the bins for ACV intervals
    bins = [0, 1e6, 5e6, float('inf')]
    labels = ['<100万', '100万-500万', '>500万']
    # Create a new column 'ACV Interval' based on the bins
    df['ACV Interval'] = pd.cut(df[acv_name], bins=bins, labels=labels, right=False)
    # 'ACV Interval' is categorical. Spell out observed=False so that buckets
    # with no rows still appear as zero-filled columns regardless of the
    # installed pandas version's default, and so that no FutureWarning about
    # the changing default is emitted.
    industry_acv_distribution = (
        df.groupby([industry_col_name, 'ACV Interval'], observed=False)
        .size()
        .unstack(fill_value=0)
    )
    industry_acv_distribution.loc['Total'] = industry_acv_distribution.sum()
    return industry_acv_distribution

# Won deals: print the ACV bucket counts grouped by each industry column.
df = pd.read_excel('./data_src/pingcap_won.xlsx')

for banner, group_col in (
    ('---------成单:ACV Distribution by industry--------', '客户分类'),
    ('---------成单:ACV Distribution by sub-industry--------', '客户行业'),
):
    print(banner)
    print(get_acv_distribution(df, 'ACV', group_col))

# Pipeline: same report, using the pipeline file's own column names.
df_pipeline = pd.read_excel('./data_src/pingcap_pipeline.xlsx')

for banner, group_col in (
    ('---------Pipeline:ACV Distribution by industry--------', '负责人所属行业'),
    ('---------Pipeline:ACV Distribution by sub-industry--------', '客户行业'),
):
    print(banner)
    print(get_acv_distribution(df_pipeline, '预估 ACV', group_col))