新增blog

2026-02-05 14:32:29 +08:00
parent 1f3e37fed1
commit ee576ac0db
1 changed files with 270 additions and 0 deletions
--- a/src/content/posts/note/index.md
+++ b/src/content/posts/note/index.md
@@ -0,0 +1,270 @@
+---
+title: 编程日记
+published: 2026-02-05
+pinned: false
+description: 编程过程中遇到的小细节
+tags: [Note]
+category: 学习日志
+draft: false
+---
+# xlsx表格读取
+
+使用 pandas（读取 xlsx 时底层调用 openpyxl）读取表格时，如果表格中有日期格式（尤其是中文日期格式）的数据，日期可能会被读取为 Excel 内部存储的日期序列号，例如 44000（对应 2020-06-18）。如何才能读取到表格中原本显示的日期？
+
+解决方案：
+[扩展 openpyxl 对 Excel 中自定义单元格格式的处理](https://www.cnblogs.com/dyfblog/p/16339375.html)
+
+openpyxl 的 `openpyxl/styles/numbers.py` 中定义了一组内置数字格式（格式 ID → 格式字符串）：
+```python
+# Table of built-in number formats: numeric format ID -> format string.
+BUILTIN_FORMATS = {
+    0: 'General',
+    1: '0',
+    2: '0.00',
+    3: '#,##0',
+    4: '#,##0.00',
+    5: '"$"#,##0_);("$"#,##0)',
+    6: '"$"#,##0_);[Red]("$"#,##0)',
+    7: '"$"#,##0.00_);("$"#,##0.00)',
+    8: '"$"#,##0.00_);[Red]("$"#,##0.00)',
+    9: '0%',
+    10: '0.00%',
+    11: '0.00E+00',
+    12: '# ?/?',
+    13: '# ??/??',
+    14: 'mm-dd-yy',
+    15: 'd-mmm-yy',
+    16: 'd-mmm',
+    17: 'mmm-yy',
+    18: 'h:mm AM/PM',
+    19: 'h:mm:ss AM/PM',
+    20: 'h:mm',
+    21: 'h:mm:ss',
+    22: 'm/d/yy h:mm',
+
+    # NOTE: this table has no entries for IDs 23-36; the keys jump from 22 to 37.
+    37: '#,##0_);(#,##0)',
+    38: '#,##0_);[Red](#,##0)',
+    39: '#,##0.00_);(#,##0.00)',
+    40: '#,##0.00_);[Red](#,##0.00)',
+
+    # Raw strings below: the format codes contain literal backslashes.
+    41: r'_(* #,##0_);_(* \(#,##0\);_(* "-"_);_(@_)',
+    42: r'_("$"* #,##0_);_("$"* \(#,##0\);_("$"* "-"_);_(@_)',
+    43: r'_(* #,##0.00_);_(* \(#,##0.00\);_(* "-"??_);_(@_)',
+
+    44: r'_("$"* #,##0.00_)_("$"* \(#,##0.00\)_("$"* "-"??_)_(@_)',
+    45: 'mm:ss',
+    46: '[h]:mm:ss',
+    47: 'mmss.0',
+    48: '##0.0E+0',
+    49: '@', 
+}
+```
+从文档中找到中文环境下格式 ID 与格式字符串的对应关系，然后以 hook 的方式将其注入 openpyxl 的 `BUILTIN_FORMATS` 即可。（注意：这段代码需要在调用 `load_workbook` 读取文件之前执行。）
+```python
+# Extend openpyxl's table of built-in number formats.
+# The entries added here are the Chinese date/time formats.
+extra_formats = {
+    27: 'yyyy"年"m"月"',
+    28: 'm"月"d"日"',
+    29: 'm"月"d"日"',
+    30: "m-d-yy",
+    31: 'yyyy"年"m"月"d"日"',
+    32: 'h"时"mm"分"',
+    33: 'h"时"mm"分"ss"秒"',
+    34: '上午/下午h"时"mm"分"',
+    35: '上午/下午h"时"mm"分"ss"秒"',
+    36: 'yyyy"年"m"月"',
+    # IDs 37-49 are not added here; the next entry starts at 50.
+    50: 'yyyy"年"m"月"',
+    51: 'm"月"d"日"',
+    52: 'yyyy"年"m"月"',
+    53: 'm"月"d"日"',
+    54: 'm"月"d"日"',
+    55: '上午/下午h"时"mm"分"',
+    56: '上午/下午h"时"mm"分"ss"秒"',
+    57: 'yyyy"年"m"月"',
+    58: 'm"月"d"日"',
+}
+from openpyxl.styles.numbers import BUILTIN_FORMATS
+
+# update() mutates the imported dict object in place, so the new IDs are
+# added to the very table that lives in openpyxl.styles.numbers.
+BUILTIN_FORMATS.update(extra_formats)
+```
+示例代码：
+```python
+"""
+使用扩展的中文数字格式读取Excel文件
+扩展openpyxl的数字格式以支持中文日期时间格式
+"""
+
+from openpyxl import load_workbook
+from openpyxl.styles.numbers import BUILTIN_FORMATS
+import os
+
+# Extend openpyxl's table of built-in number formats.
+# The entries added here are the Chinese date/time formats.
+extra_formats = {
+    27: 'yyyy"年"m"月"',
+    28: 'm"月"d"日"',
+    29: 'm"月"d"日"',
+    30: "m-d-yy",
+    31: 'yyyy"年"m"月"d"日"',
+    32: 'h"时"mm"分"',
+    33: 'h"时"mm"分"ss"秒"',
+    34: '上午/下午h"时"mm"分"',
+    35: '上午/下午h"时"mm"分"ss"秒"',
+    36: 'yyyy"年"m"月"',
+    # IDs 37-49 are not added here; the next entry starts at 50.
+    50: 'yyyy"年"m"月"',
+    51: 'm"月"d"日"',
+    52: 'yyyy"年"m"月"',
+    53: 'm"月"d"日"',
+    54: 'm"月"d"日"',
+    55: '上午/下午h"时"mm"分"',
+    56: '上午/下午h"时"mm"分"ss"秒"',
+    57: 'yyyy"年"m"月"',
+    58: 'm"月"d"日"',
+}
+
+# Merge the extra entries into the built-in table. update() mutates the
+# imported dict object in place.
+BUILTIN_FORMATS.update(extra_formats)
+
+
+def read_xlsx_file(file_path):
+    """
+    读取Excel文件并输出详细信息
+    
+    Args:
+        file_path: Excel文件路径
+    """
+    if not os.path.exists(file_path):
+        print(f"文件不存在: {file_path}")
+        return
+    
+    print(f"正在读取文件: {file_path}")
+    print("=" * 60)
+    
+    try:
+        # 加载工作簿
+        wb = load_workbook(file_path, data_only=False)
+        
+        print(f"文件加载成功")
+        print(f"工作表数量: {len(wb.sheetnames)}")
+        print(f"工作表名称: {wb.sheetnames}")
+        print("=" * 60)
+        
+        # 遍历所有工作表
+        for sheet_name in wb.sheetnames:
+            print(f"\n工作表: {sheet_name}")
+            print("-" * 60)
+            
+            ws = wb[sheet_name]
+            
+            # 获取工作表维度
+            if ws.max_row > 0 and ws.max_column > 0:
+                print(f"数据范围: {ws.max_row} 行 × {ws.max_column} 列")
+                
+                # 读取前10行数据（或所有数据，如果少于10行）
+                max_display_rows = min(100, ws.max_row)
+                
+                print(f"\n数据内容（前{max_display_rows}行）:")
+                print("-" * 60)
+                
+                for row_idx, row in enumerate(ws.iter_rows(min_row=1, max_row=max_display_rows, values_only=False), start=1):
+                    row_data = []
+                    for cell in row:
+                        # 获取单元格值
+                        cell_value = cell.value
+                        
+                        # 获取单元格格式信息
+                        cell_info = {
+                            'value': cell_value,
+                            'coordinate': cell.coordinate,
+                        }
+                        
+                        # 如果有数字格式，获取格式信息
+                        if cell.number_format:
+                            cell_info['number_format'] = cell.number_format
+                            
+                            # 尝试从BUILTIN_FORMATS中查找格式描述
+                            if cell.number_format in BUILTIN_FORMATS.values():
+                                # 查找格式ID
+                                format_id = None
+                                for fmt_id, fmt_str in BUILTIN_FORMATS.items():
+                                    if fmt_str == cell.number_format:
+                                        format_id = fmt_id
+                                        break
+                                if format_id:
+                                    cell_info['format_id'] = format_id
+                        
+                        # 获取数据类型
+                        if cell_value is not None:
+                            cell_info['data_type'] = type(cell_value).__name__
+                        
+                        row_data.append(cell_info)
+                    
+                    # 输出行数据
+                    print(f"\n行 {row_idx}:")
+                    for cell_info in row_data:
+                        value_str = str(cell_info['value']) if cell_info['value'] is not None else '(空)'
+                        coord = cell_info['coordinate']
+                        
+                        info_parts = [f"  {coord}: {value_str}"]
+                        
+                        if 'data_type' in cell_info:
+                            info_parts.append(f"[类型: {cell_info['data_type']}]")
+                        
+                        if 'number_format' in cell_info:
+                            fmt = cell_info['number_format']
+                            if 'format_id' in cell_info:
+                                info_parts.append(f"[格式ID: {cell_info['format_id']}, 格式: {fmt}]")
+                            else:
+                                info_parts.append(f"[格式: {fmt}]")
+                        
+                        print(" ".join(info_parts))
+                
+                if ws.max_row > max_display_rows:
+                    print(f"\n... (还有 {ws.max_row - max_display_rows} 行未显示)")
+            else:
+                print("工作表为空")
+        
+        print("\n" + "=" * 60)
+        print("读取完成")
+        
+        # 关闭工作簿
+        wb.close()
+        
+    except Exception as e:
+        print(f"读取文件时发生错误: {str(e)}")
+        import traceback
+        traceback.print_exc()
+
+
+def main():
+    """主函数"""
+    # 默认读取example.xlsx，如果不存在则尝试其他文件
+    test_files = [
+        "test.xlsx"
+    ]
+    
+    # 查找存在的文件
+    file_to_read = None
+    for file in test_files:
+        if os.path.exists(file):
+            file_to_read = file
+            break
+    
+    if file_to_read:
+        read_xlsx_file(file_to_read)
+    else:
+        print("未找到可用的Excel文件")
+        print("请将Excel文件放在当前目录，或修改main()函数中的文件路径")
+        print(f"\n当前目录: {os.getcwd()}")
+        print(f"可用的xlsx文件:")
+        for root, dirs, files in os.walk('.'):
+            for file in files:
+                if file.endswith('.xlsx') and not file.startswith('~$'):
+                    print(f"  - {os.path.join(root, file)}")
+
+
+if __name__ == "__main__":
+    main()
+```