import pandas as pd from typing import List from openpyxl import Workbook def strip_character(column_name, characters: List[str]): new_col_name = column_name for character in characters: new_col_name = new_col_name.replace(character, '') new_col_name = new_col_name.strip() return new_col_name def refine_content(df): strip_character_list = [' ', '\n', ':', ':','其他'] for col in df.columns: df[col] = df[col].apply(lambda x: "其他" if strip_character(x, strip_character_list) == "" else strip_character(x, strip_character_list)) return df def get_acv_distribution(df, acv_name, industry_col_name): # Define the bins for ACV intervals bins = [0, 1e6, 5e6, float('inf')] labels = ['<100万', '100万-500万', '>500万'] # Create a new column 'ACV Interval' based on the bins df['ACV Interval'] = pd.cut(df[acv_name], bins=bins, labels=labels, right=False) industry_acv_distribution = df.groupby([industry_col_name, 'ACV Interval']).size().unstack(fill_value=0) industry_acv_distribution.loc['Total'] = industry_acv_distribution.sum() return industry_acv_distribution # 新增函数:将结果保存到Excel def save_to_excel(dfs, sheet_names, output_file): with pd.ExcelWriter(output_file, engine='openpyxl') as writer: for df, sheet_name in zip(dfs, sheet_names): df.to_excel(writer, sheet_name=sheet_name) # 读取数据 df = pd.read_excel('./data_src/pingcap_won.xlsx') df_pipeline = pd.read_excel('./data_src/pingcap_pipeline.xlsx') # 获取各种分布 won_industry_dist = get_acv_distribution(df, 'ACV', '客户分类') won_sub_industry_dist = get_acv_distribution(df, 'ACV', '客户行业') pipeline_industry_dist = get_acv_distribution(df_pipeline, '预估 ACV', '负责人所属行业') pipeline_sub_industry_dist = get_acv_distribution(df_pipeline, '预估 ACV', '客户行业') # 保存结果到Excel dfs = [won_industry_dist, won_sub_industry_dist, pipeline_industry_dist, pipeline_sub_industry_dist] sheet_names = ['成单-行业分布', '成单-子行业分布', 'Pipeline-行业分布', 'Pipeline-子行业分布'] save_to_excel(dfs, sheet_names, './output/acv_distribution.xlsx') print("ACV distribution analysis completed. Results saved in './output/acv_distribution.xlsx'")