70 lines
3.1 KiB
Python
70 lines
3.1 KiB
Python
import pandas as pd
|
||
from typing import List
|
||
from openpyxl import Workbook
|
||
|
||
def strip_character(column_name, characters: List[str]):
|
||
|
||
new_col_name = column_name
|
||
for character in characters:
|
||
new_col_name = new_col_name.replace(character, '')
|
||
new_col_name = new_col_name.strip()
|
||
return new_col_name
|
||
|
||
def refine_content(df):
|
||
strip_character_list = [' ', '\n', ':', ':','其他']
|
||
for col in df.columns:
|
||
df[col] = df[col].apply(lambda x: "其他" if strip_character(x, strip_character_list) == "" else strip_character(x, strip_character_list))
|
||
return df
|
||
|
||
def get_acv_distribution(df, acv_name, industry_col_name):
|
||
# Define the bins for ACV intervals
|
||
bins = [0, 1e6, 5e6, float('inf')]
|
||
labels = ['<100万', '100万-500万', '>500万']
|
||
# Create a new column 'ACV Interval' based on the bins
|
||
df['ACV Interval'] = pd.cut(df[acv_name], bins=bins, labels=labels, right=False)
|
||
industry_acv_distribution = df.groupby([industry_col_name, 'ACV Interval']).size().unstack(fill_value=0)
|
||
industry_acv_distribution.loc['Total'] = industry_acv_distribution.sum()
|
||
return industry_acv_distribution
|
||
|
||
def get_acv_distribution_sum(df, acv_name, industry_col_name):
|
||
# Define the bins for ACV intervals
|
||
bins = [0, 1e6, 5e6, float('inf')]
|
||
labels = ['<100万', '100万-500万', '>500万']
|
||
|
||
# Create a new column 'ACV Interval' based on the bins
|
||
df['ACV Interval'] = pd.cut(df[acv_name], bins=bins, labels=labels, right=False)
|
||
|
||
# Group by industry and ACV interval, then sum the ACV values
|
||
industry_acv_sum = df.groupby([industry_col_name, 'ACV Interval'])[acv_name].sum().unstack(fill_value=0)
|
||
|
||
# Add a 'Total' row
|
||
industry_acv_sum.loc['Total'] = industry_acv_sum.sum()
|
||
|
||
return industry_acv_sum
|
||
|
||
# 新增函数:将结果保存到Excel
|
||
def save_to_excel(dfs, sheet_names, output_file):
|
||
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
|
||
for df, sheet_name in zip(dfs, sheet_names):
|
||
df.to_excel(writer, sheet_name=sheet_name)
|
||
|
||
# 读取数据
|
||
df = pd.read_excel('./data_src/pingcap_won.xlsx')
|
||
df_pipeline = pd.read_excel('./data_src/pingcap_pipeline.xlsx')
|
||
|
||
# 获取各种分布
|
||
won_industry_dist = get_acv_distribution(df, 'ACV', '客户分类')
|
||
won_sub_industry_dist = get_acv_distribution(df, 'ACV', '客户行业')
|
||
pipeline_industry_dist = get_acv_distribution(df_pipeline, '预估 ACV', '负责人所属行业')
|
||
pipeline_sub_industry_dist = get_acv_distribution(df_pipeline, '预估 ACV', '客户行业')
|
||
|
||
pipeline_sub_industry_dist_sum = get_acv_distribution_sum(df_pipeline, '预估 ACV', '客户行业')
|
||
pipeline_sub_industry_dist_sum.to_excel('./output/pipeline_sub_industry_dist_sum.xlsx')
|
||
|
||
|
||
# 保存结果到Excel
|
||
dfs = [won_industry_dist, won_sub_industry_dist, pipeline_industry_dist, pipeline_sub_industry_dist]
|
||
sheet_names = ['成单-行业分布', '成单-子行业分布', 'Pipeline-行业分布', 'Pipeline-子行业分布']
|
||
# save_to_excel(dfs, sheet_names, './output/acv_distribution.xlsx')
|
||
|
||
print("ACV distribution analysis completed. Results saved in './output/acv_distribution.xlsx'") |