Keywords: dataviz; data visualization; Dream of the Red Chamber; 红楼梦
This paper presents data visualizations focusing on the three main protagonists of Dream of the Red Chamber (红楼梦), a classic work of Chinese literature. It reports the results of experiments I conducted using Python in a Quarto environment with Jupyter.
We import fulltext_simplified.txt, a text file retrieved from lilesIII, written in simplified Chinese characters.
```python
# The echo: false option disables the printing of code (only output is displayed).
with open("data/fulltext_simplified.txt", "r", encoding="utf-8") as file:
    main_text = file.read()
```
We put the content into a pandas DataFrame and clean the data:
| Chapter_num | Chapter | RawContent |
|---|---|---|
| integer | extracted from the chapter header | full text of the chapter |
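The cell that builds df_cn is not shown in the rendered page; here is a minimal sketch of what it might look like, assuming chapter headers follow the usual 第…回 convention (the exact regex may need adjusting for this edition):

```python
import re
import pandas as pd

# Assumption: each chapter starts with a header line of the form 第…回.
chapter_pattern = re.compile(r'(第[一二三四五六七八九十百零]+回[^\n]*)')
parts = chapter_pattern.split(main_text)

# parts alternates: [preamble, header 1, body 1, header 2, body 2, ...]
rows = []
for num, (header, body) in enumerate(zip(parts[1::2], parts[2::2]), start=1):
    rows.append({'Chapter_num': num, 'Chapter': header.strip(), 'RawContent': body})

df_cn = pd.DataFrame(rows)
```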
We construct a list of name variants and synonyms for the three main protagonists—Jia Baoyu, Lin Daiyu, and Xue Baochai—and quantify their occurrences throughout the 120 chapters.
```python
# Count how many times each of the three main protagonists is mentioned in RawContent.
# Synonyms are enriched with labels from Wikidata.

# List of Jia Baoyu's synonyms: https://www.wikidata.org/wiki/Q8428650
jia_baoyu_synonyms = [
    '贾宝玉', '賈寶玉', '寶玉', '寶二爺', '怡紅公子',
    '絳洞花王', '富貴閒人', '宝玉', '宝二爷',
    '絳洞花主', '绛洞花主', '怡红公子', '绛洞花王'
]

# Join them into a regex pattern
jia_baoyu_pattern = '|'.join(jia_baoyu_synonyms)

# Use the pattern to count mentions per chapter
df_cn['count_JiaBaoyu'] = df_cn['RawContent'].str.count(jia_baoyu_pattern)

# List of Lin Daiyu's synonyms
lin_daiyu_synonyms = [
    '林黛玉', '黛玉', '瀟湘妃子', '顰兒', '颦儿',
    '林姑娘', '林妹妹', '潇湘妃子', '玉儿', '颦颦',
]

lin_daiyu_pattern = '|'.join(lin_daiyu_synonyms)
df_cn['count_LinDaiyu'] = df_cn['RawContent'].str.count(lin_daiyu_pattern)

# List of Xue Baochai's synonyms
xue_baochai_synonyms = [
    '薛寶釵', '薛宝钗', '寶釵', '宝钗',
    '蘅蕪君', '蘅芜君', '寶姑娘', '宝姑娘',
    '寶丫頭', '宝丫头', '寶姐姐', '宝姐姐'
]

xue_baochai_pattern = '|'.join(xue_baochai_synonyms)
df_cn['count_XueBaochai'] = df_cn['RawContent'].str.count(xue_baochai_pattern)
```
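One detail worth noting (my observation, not from the original notebook): regex matching is non-overlapping and proceeds left to right, so a full name like 贾宝玉 is counted once, and the 宝玉 embedded inside it is not double-counted. A quick illustrative check:

```python
import re

# Non-overlapping, left-to-right matching: the 宝玉 inside 贾宝玉
# is not counted a second time.
print(re.findall('贾宝玉|宝玉', '贾宝玉笑道'))  # ['贾宝玉'] (one match, not two)
```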
A chapter-by-chapter heatmap is generated, where color intensity corresponds to the frequency of mentions of each of the three main protagonists.
```python
# Note that we included the cell option fold: true to hide the code by default
# (click the Code button to show it).
# pip install scikit-learn jieba seaborn matplotlib

# Draw a heat map of the per-chapter counts
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Set Chapter_num as index for labeling rows in the heatmap
heatmap_data = df_cn.set_index('Chapter_num')[['count_JiaBaoyu', 'count_LinDaiyu', 'count_XueBaochai']]

# Plot the heatmap without labels or annotations
plt.figure(figsize=(10, 6))
sns.heatmap(
    heatmap_data,
    annot=False,        # no numbers inside cells
    cmap='RdPu',        # color palette
    linewidths=0.5,
    cbar=True,
    xticklabels=True,   # column names
    yticklabels=False   # no row labels
)

# Set y-ticks at intervals of 25
num_rows = heatmap_data.shape[0]
ticks = np.arange(0, num_rows, 25)
plt.yticks(ticks + 0.5, ticks + 1)  # +0.5 centers ticks in cells, +1 shifts to 1-based indexing

plt.title('«Dream of the Red Chamber» Three Main Protagonists\n(Number of Mentions per Chapter)')
plt.ylabel('Chapter Number')
plt.xlabel('')  # remove the x-axis label
plt.tight_layout()
plt.savefig('images/characters_count_hlm.png', dpi=300, bbox_inches='tight')
plt.show()
```
It’s not very readable, but it gives a rough idea of which parts of the novel involve interactions between the protagonists.
```python
# Set up data
chapter_nums = df_cn['Chapter_num']
counts_baoyu = df_cn['count_JiaBaoyu']
counts_daiyu = df_cn['count_LinDaiyu']
counts_baochai = df_cn['count_XueBaochai']

# Set figure size
plt.figure(figsize=(12, 6))

# Plot stacked bars
plt.bar(chapter_nums, counts_baoyu, label='Jia Baoyu', color='mediumvioletred')
plt.bar(chapter_nums, counts_daiyu, bottom=counts_baoyu, label='Lin Daiyu', color='orchid')
plt.bar(
    chapter_nums,
    counts_baochai,
    bottom=counts_baoyu + counts_daiyu,
    label='Xue Baochai',
    color='plum'
)

# Labels and legend
plt.title('«Dream of the Red Chamber» Three Main Protagonists\n(Number of Mentions per Chapter)')
plt.xlabel('Chapter Number')
plt.ylabel('Number of Mentions')
plt.legend()
plt.tight_layout()

# Save and show
plt.savefig('images/characters_count_stackedbars.png', dpi=300, bbox_inches='tight')
plt.show()
```
This one is not stacked, so it’s more readable.
```python
# Set up data
chapter_nums = df_cn['Chapter_num']
counts_baoyu = df_cn['count_JiaBaoyu']
counts_daiyu = df_cn['count_LinDaiyu']
counts_baochai = df_cn['count_XueBaochai']

# Set figure and axes
fig, axs = plt.subplots(3, 1, figsize=(12, 10), sharex=True)

# Jia Baoyu
axs[0].bar(chapter_nums, counts_baoyu, color='mediumvioletred')
axs[0].set_title('Jia Baoyu – Number of Mentions per Chapter')
axs[0].set_ylabel('Mentions')

# Lin Daiyu
axs[1].bar(chapter_nums, counts_daiyu, color='orchid')
axs[1].set_title('Lin Daiyu – Number of Mentions per Chapter')
axs[1].set_ylabel('Mentions')

# Xue Baochai
axs[2].bar(chapter_nums, counts_baochai, color='plum')
axs[2].set_title('Xue Baochai – Number of Mentions per Chapter')
axs[2].set_ylabel('Mentions')
axs[2].set_xlabel('Chapter Number')

# Tweak layout
plt.suptitle('«Dream of the Red Chamber» – Character Mentions per Chapter', fontsize=16, y=1.02)
plt.tight_layout()
plt.savefig('images/characters_count_separate_bars.png', dpi=300, bbox_inches='tight')
plt.show()
```
Threshold (maximum distance between two mentioned names) = 20 characters.
```python
import re

# Count how many times a Jia Baoyu synonym AND a Lin Daiyu synonym occur
# within `threshold` characters of each other.
threshold = 20

# Compile regex patterns (non-capturing groups for clarity)
jia_baoyu_pattern = r'(?:' + '|'.join(jia_baoyu_synonyms) + r')'
lin_daiyu_pattern = r'(?:' + '|'.join(lin_daiyu_synonyms) + r')'

# Pattern: 贾宝玉 followed by at most `threshold` chars then 林黛玉, OR the reverse order
jia_lin_pattern = rf'{jia_baoyu_pattern}.{{0,{threshold}}}{lin_daiyu_pattern}|{lin_daiyu_pattern}.{{0,{threshold}}}{jia_baoyu_pattern}'

# Function to count matches in a string
def count_jia_lin_pairs(text):
    return len(re.findall(jia_lin_pattern, text))

# Apply to the DataFrame
df_cn['JiaBaoyu_LinDaiyu'] = df_cn['RawContent'].apply(count_jia_lin_pairs)

# Count pairs of Jia and Xue (same pattern structure)
xue_baochai_pattern = r'(?:' + '|'.join(xue_baochai_synonyms) + r')'

jia_xue_pattern = rf'{jia_baoyu_pattern}.{{0,{threshold}}}{xue_baochai_pattern}|{xue_baochai_pattern}.{{0,{threshold}}}{jia_baoyu_pattern}'

def count_jia_xue_pairs(text):
    return len(re.findall(jia_xue_pattern, text))

df_cn['JiaBaoyu_XueBaochai'] = df_cn['RawContent'].apply(count_jia_xue_pairs)

# Count pairs of Lin and Xue
lin_xue_pattern = rf'{lin_daiyu_pattern}.{{0,{threshold}}}{xue_baochai_pattern}|{xue_baochai_pattern}.{{0,{threshold}}}{lin_daiyu_pattern}'

def count_lin_xue_pairs(text):
    return len(re.findall(lin_xue_pattern, text))

df_cn['LinDaiyu_XueBaochai'] = df_cn['RawContent'].apply(count_lin_xue_pairs)
```
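One caveat worth flagging (my note, not part of the original analysis): by default `.` does not match newline characters, so a pair of names separated by a line break is not counted. If that matters, the patterns can be compiled with `re.DOTALL`:

```python
# Compile with re.DOTALL so '.' also matches newlines (assumption: RawContent
# contains line breaks; without this flag, cross-line pairs are missed).
jia_lin_dotall = re.compile(jia_lin_pattern, re.DOTALL)
df_cn['JiaBaoyu_LinDaiyu'] = df_cn['RawContent'].apply(lambda t: len(jia_lin_dotall.findall(t)))
```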
```python
# Set Chapter_num as index for labeling rows in the heatmap
heatmap_data = df_cn.set_index('Chapter_num')[['JiaBaoyu_LinDaiyu', 'JiaBaoyu_XueBaochai', 'LinDaiyu_XueBaochai']]

# Plot the heatmap without labels or annotations
plt.figure(figsize=(10, 6))
sns.heatmap(
    heatmap_data,
    annot=False,        # no numbers inside cells
    cmap='RdPu',
    linewidths=0.5,
    cbar=True,
    xticklabels=True,   # column names
    yticklabels=False   # no row labels
)

# Set y-ticks at intervals of 25
num_rows = heatmap_data.shape[0]
ticks = np.arange(0, num_rows, 25)
plt.yticks(ticks + 0.5, ticks + 1)  # +0.5 centers ticks in cells, +1 shifts to 1-based indexing

plt.title('«Dream of the Red Chamber» Pairs of Protagonists\nMentioned Together (per Chapter)')
plt.ylabel('Chapter Number')
plt.xlabel('')  # remove the x-axis label
plt.tight_layout()
plt.savefig('images/pairs_characters_hlm.png', dpi=300, bbox_inches='tight')
plt.show()
```
From now on, I will no longer build synonym lists by hand: that approach does not scale to a large number of characters.
From lileslll, I found the file userdict.json. It contains 165 characters of the novel and their nicknames. I rename it userdict_simplified.json.
I filter the corpus by removing all words not found in userdict_simplified.json, allowing us to build a simple similarity matrix based solely on the characters mentioned in each chapter.
```python
import jieba
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the user dictionary
with open("data/userdict_simplified.json", "r", encoding="utf-8") as f:
    user_dict = json.load(f)

# Flatten all names and synonyms into a set for fast lookup
valid_names = set(user_dict.keys())
for synonyms in user_dict.values():
    valid_names.update(synonyms)

# Keep only the tokens that are known character names
def extract_only_names(text):
    words = jieba.lcut(text)
    names = [word for word in words if word in valid_names]
    return ' '.join(names)

# Apply to create a new column
df_cn['OnlyNames'] = df_cn['RawContent'].apply(extract_only_names)

# TF-IDF vectorization on the segmented text
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_cn['OnlyNames'])

# Cosine similarity between chapters
similarity_matrix = cosine_similarity(tfidf_matrix)

# Format as a DataFrame for the heatmap
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=[f"{idx+1}" for idx in range(len(df_cn))],
    columns=[f"{idx+1}" for idx in range(len(df_cn))]
)

# Plot heatmap (the diagonal, always 1, is masked out)
plt.figure(figsize=(12, 10))
sns.heatmap(similarity_df,
            cmap='RdPu',
            vmin=0,
            mask=np.eye(similarity_df.shape[0]),
            vmax=1,
            linewidths=0.2)

plt.title("Chapter Similarity Matrix (based on a selection of characters)")
plt.xlabel("Chapters", ha='right')
plt.ylabel("Chapters")
plt.tight_layout()
plt.savefig('images/similarity_matrix2.png', dpi=300, bbox_inches='tight')
plt.show()
```
```
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\martinop\AppData\Local\Temp\jieba.cache
Loading model cost 0.859 seconds.
Prefix dict has been built successfully.
```
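A possible refinement (an assumption on my part, not something the notebook does): registering every name variant with jieba before segmenting, so the segmenter keeps rare names as single tokens instead of splitting them character by character:

```python
# Register each known name with jieba's dictionary so the segmenter
# treats it as one token. Run this before applying extract_only_names.
for name in valid_names:
    jieba.add_word(name)
```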
We put userdict_simplified.json into a dataframe with two columns: Name and NamePattern.
```json
{
  "贾宝玉": [
    "贾宝玉",
    "宝玉",
    "宝二爷",
    "怡红公子",
    "绛洞花主",
    "宝兄弟"
  ],
  "林黛玉": [
    "林黛玉",
    ...
```
Each top-level key becomes Name, and each entry in its list becomes NamePattern; these become the columns of the dataframe. A third column, Count, stores the number of mentions.
| Name | NamePattern | Count |
|---|---|---|
| 贾宝玉 | 贾宝玉 | |
| 贾宝玉 | 宝玉 | |
| 贾宝玉 | 宝二爷 | |
```python
import json
import pandas as pd

# 1. Load the JSON
with open('data/userdict_simplified.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# 2. Extract all values into a list of rows
rows = []
for id_key, list_of_keywords in data.items():
    for keyword in list_of_keywords:
        rows.append({'Name': id_key, 'NamePattern': keyword})

# 3. Create a DataFrame
df_keywords = pd.DataFrame(rows)
# print(df_keywords)

# Now count: initialize a new column for the counts
df_keywords['Count'] = 0

# Loop through each NamePattern
for idx, row in df_keywords.iterrows():
    pattern = row['NamePattern'].lower()  # lowercase for case-insensitive matching
    count = df_cn['RawContent'].dropna().str.lower().str.count(pattern).sum()
    df_keywords.at[idx, 'Count'] = count
# print(df_keywords)

# Export the result for control
df_keywords.to_csv('temp/count_keywords.csv', index=False)

# Group by Name, then sum the counts
total_per_Name = df_keywords.groupby('Name', as_index=False)['Count'].sum()

# Sort by Count descending
total_per_Name = total_per_Name.sort_values(by='Count', ascending=False)
# print(total_per_Name)

# Export the result for control
total_per_Name.to_csv('temp/total_per_Name.csv', index=False)
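```

A small aside (my observation, not part of the notebook): `Series.str.count` interprets its argument as a regular expression. The name variants here are plain Chinese strings, so this is safe, but escaping the pattern keeps the count literal in the general case:

```python
import re

# re.escape guards against NamePattern entries that happen to contain
# regex metacharacters (hypothetical here, since the names are plain text).
count = df_cn['RawContent'].dropna().str.count(re.escape(pattern)).sum()
```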
To improve readability, I display the results of the count using a simple visualization, showing only the top 20 most frequently mentioned characters.
```python
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Use SimHei font for Chinese support on Windows
font_path = "C:/Windows/Fonts/simhei.ttf"
zh_font = fm.FontProperties(fname=font_path)

# Select the top 20
top_20 = total_per_Name.head(20)

# Plot
plt.figure(figsize=(12, 8))

# Draw the bars
bars = plt.barh(top_20["Name"], top_20["Count"], color="lightpink")

# Add count values to the right of each bar
for bar in bars:
    width = bar.get_width()
    plt.text(width + 1, bar.get_y() + bar.get_height() / 2,
             str(width), va='center', fontsize=10, fontproperties=zh_font)

plt.xlabel("Count", fontproperties=zh_font)
plt.ylabel("Name", fontproperties=zh_font)
plt.title("Top 20 Most Mentioned Names")
plt.gca().invert_yaxis()  # so the highest count is at the top
# plt.tight_layout()
plt.yticks(fontproperties=zh_font)
plt.xticks(fontproperties=zh_font)
plt.show()
```
This next step is more ambitious: I will count how many times a pair of characters (story characters) is mentioned together. I choose a threshold of 20 characters (written characters) between two mentioned names.
I expand the data derived from userdict_simplified.json to record the Chapter_num where each NamePattern is mentioned and the numerical position of each match.
| Name | NamePattern | Chapter_num | Position in RawContent |
|---|---|---|---|
| 贾宝玉 | 贾宝玉 | 1 | 3650 |
| 贾宝玉 | 宝玉 | 6 | 5896 |
| … | … | … | … |
```python
results = []

# Iterate over each chapter in df_cn
for i, row in df_cn.iterrows():
    chapter_num = row['Chapter_num']
    raw_content = row['RawContent']

    # Check each keyword pattern against this content
    for _, kw_row in df_keywords.iterrows():
        name = kw_row['Name']
        pattern = kw_row['NamePattern']

        # Find all matches of the pattern in the content
        for match in re.finditer(re.escape(pattern), raw_content):
            results.append({'Name': name,
                            'NamePattern': pattern,
                            'Chapter_num': chapter_num,
                            'Position': match.start()})

# Convert the results into a new DataFrame
df_matches = pd.DataFrame(results)
# print(df_matches)

# Export for control
df_matches.to_csv('temp/count_char_position.csv', index=False)
```
```python
# Sanity check: the totals per Name should match total_per_Name computed earlier
name_counts = df_matches.groupby('Name').size().reset_index(name='Count')
name_counts = name_counts.sort_values(by='Count', ascending=False)
# print(name_counts)
# print(total_per_Name)
```
Checked ✅
Now I want a matrix over NamePattern values that counts co-occurrences of characters within the same chapter.
| | NamePattern1 | NamePattern2 | NamePattern3 | … |
|---|---|---|---|---|
| NamePattern1 | 4 | 0 | 2 | |
| NamePattern2 | 0 | 3 | 5 | |
| NamePattern3 | 2 | 5 | 3 | |
| … | | | | |
The following code took 12 hours (!) to complete on my laptop. ⬇️
""" UNCOMMENT TO REACTIVATE
import pandas as pd
import numpy as np
# Get unique NamePatterns
name_patterns = df_matches['NamePattern'].unique()
# Initialize the matrix with zeros
matrix = pd.DataFrame(0, index=name_patterns, columns=name_patterns)
# Iterate through each pair of matches
for i, row_i in df_matches.iterrows():
for j, row_j in df_matches.iterrows():
if i >= j:
continue # Avoid duplicate and self comparisons
# Check if in same chapter
if row_i['Chapter_num'] == row_j['Chapter_num']:
# Check if within +/- 20 positions
if abs(row_i['Position'] - row_j['Position']) <= 20:
matrix.loc[row_i['NamePattern'], row_j['NamePattern']] += 1
matrix.loc[row_j['NamePattern'], row_i['NamePattern']] += 1 # symmetric
# Export for control
matrix.to_csv('temp/matrix_proximity.csv', index=True)
"""
" UNCOMMENT TO REACTIVATE \n\nimport pandas as pd\nimport numpy as np\n\n# Get unique NamePatterns\nname_patterns = df_matches['NamePattern'].unique()\n\n# Initialize the matrix with zeros\nmatrix = pd.DataFrame(0, index=name_patterns, columns=name_patterns)\n\n# Iterate through each pair of matches\nfor i, row_i in df_matches.iterrows():\n for j, row_j in df_matches.iterrows():\n if i >= j:\n continue # Avoid duplicate and self comparisons\n\n # Check if in same chapter\n if row_i['Chapter_num'] == row_j['Chapter_num']:\n # Check if within +/- 20 positions\n if abs(row_i['Position'] - row_j['Position']) <= 20:\n matrix.loc[row_i['NamePattern'], row_j['NamePattern']] += 1\n matrix.loc[row_j['NamePattern'], row_i['NamePattern']] += 1 # symmetric\n \n# Export for control\nmatrix.to_csv('temp/matrix_proximity.csv', index=True)\n\n"
Reading the CSV generated previously (to avoid recalculating):
```python
import pandas as pd

# Load the matrix from CSV
matrix = pd.read_csv('temp/matrix_proximity.csv', index_col=0)
```
Finalizing the proximity counts by merging the NamePattern rows and columns (summing their values) according to their respective Name.
```python
# Replace NamePattern with the associated Name
# Create a mapping from NamePattern to Name
name_map = df_matches.drop_duplicates('NamePattern').set_index('NamePattern')['Name']

matrix_renamed = matrix

# Rename rows and columns
matrix_renamed.rename(index=name_map, columns=name_map, inplace=True)

# Merge all identical Name rows and columns to aggregate the synonymous NamePattern entries
# Group and sum rows with the same name
matrix_renamed = matrix_renamed.groupby(level=0).sum()

# Group and sum columns with the same name (use transpose instead of axis=1)
matrix_renamed = matrix_renamed.T.groupby(level=0).sum().T

# Set the diagonal (row == column) to 0
np.fill_diagonal(matrix_renamed.values, 0)

# Export for control
matrix_renamed.to_csv('temp/matrix_renamed_proximity.csv', index=True)
```
Keeping only the 20 names with the highest proximity totals:
```python
# Compute the sum for each row/column
row_sums = matrix_renamed.sum(axis=1)
col_sums = matrix_renamed.sum(axis=0)

# Get the top 20 names by total connections
top_names = row_sums.add(col_sums, fill_value=0).sort_values(ascending=False).head(20).index

# Filter the matrix to the top 20 rows and columns
matrix_renamed20 = matrix_renamed.loc[top_names, top_names]

# Export for control
matrix_renamed20.to_csv('temp/matrix_renamed20_proximity.csv', index=True)
```
The resulting heatmap represents the proximity between characters throughout the entire novel. We observe that Jia Baoyu interacts more frequently with Lin Daiyu than with Xue Baochai. He also has numerous interactions with Grandmother Jia, Lady Wang, and Jia Xiren.
```python
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Use SimHei font for Chinese support on Windows
font_path = "C:/Windows/Fonts/simhei.ttf"
zh_font = fm.FontProperties(fname=font_path)

# Set up the figure
plt.figure(figsize=(12, 10))

# Draw the heatmap
sns.heatmap(matrix_renamed20, cmap='Reds', linewidths=0.5, square=True, annot=True, fmt='d')

# Add title and axis labels
plt.title("Co-occurrence Heatmap (Top 20 Names)", fontsize=16, fontproperties=zh_font)
plt.xlabel("Name", ha='right', fontproperties=zh_font)
plt.ylabel("Name", fontproperties=zh_font)

# Set tick labels with the Chinese font
plt.xticks(rotation=45, ha='right', fontproperties=zh_font)
plt.yticks(rotation=0, fontproperties=zh_font)

# plt.tight_layout()
plt.show()
```