diff --git a/src/content/posts/note/index.md b/src/content/posts/note/index.md new file mode 100644 index 0000000..bdff481 --- /dev/null +++ b/src/content/posts/note/index.md @@ -0,0 +1,270 @@ +--- +title: 编程日记 +published: 2026-02-05 +pinned: false +description: 编程过程中遇到的小细节 +tags: [Note] +category: 学习日志 +draft: false +--- +# xlsx表格读取 + +当使用pandas读取xlsx表格时,如果表格中有日期格式的数据,读取时会看到日期被读取为数字,如44000。如何才能读取到原本显示的日期格式? + +解决方案: +[扩展 openpyxl 对 Excel 中自定义单元格格式的处理 ](https://www.cnblogs.com/dyfblog/p/16339375.html) + +openpyxl.styles.numbers.py中有一组格式 +```python +BUILTIN_FORMATS = { + 0: 'General', + 1: '0', + 2: '0.00', + 3: '#,##0', + 4: '#,##0.00', + 5: '"$"#,##0_);("$"#,##0)', + 6: '"$"#,##0_);[Red]("$"#,##0)', + 7: '"$"#,##0.00_);("$"#,##0.00)', + 8: '"$"#,##0.00_);[Red]("$"#,##0.00)', + 9: '0%', + 10: '0.00%', + 11: '0.00E+00', + 12: '# ?/?', + 13: '# ??/??', + 14: 'mm-dd-yy', + 15: 'd-mmm-yy', + 16: 'd-mmm', + 17: 'mmm-yy', + 18: 'h:mm AM/PM', + 19: 'h:mm:ss AM/PM', + 20: 'h:mm', + 21: 'h:mm:ss', + 22: 'm/d/yy h:mm', + + 37: '#,##0_);(#,##0)', + 38: '#,##0_);[Red](#,##0)', + 39: '#,##0.00_);(#,##0.00)', + 40: '#,##0.00_);[Red](#,##0.00)', + + 41: r'_(* #,##0_);_(* \(#,##0\);_(* "-"_);_(@_)', + 42: r'_("$"* #,##0_);_("$"* \(#,##0\);_("$"* "-"_);_(@_)', + 43: r'_(* #,##0.00_);_(* \(#,##0.00\);_(* "-"??_);_(@_)', + + 44: r'_("$"* #,##0.00_)_("$"* \(#,##0.00\)_("$"* "-"??_)_(@_)', + 45: 'mm:ss', + 46: '[h]:mm:ss', + 47: 'mmss.0', + 48: '##0.0E+0', + 49: '@', +} +``` +从文档中找到中文格式的格式 ID 与格式字符串的对应关系,然后采用 hook 的方式将其注入 openpyxl 模块中即可。(注意这些代码需要在导入 openpyxl 之后、调用 load_workbook 加载工作簿之前执行) +```python +# 扩展openpyxl的数字格式 +# 此处扩展的是中文格式 +extra_formats = { + 27: 'yyyy"年"m"月"', + 28: 'm"月"d"日"', + 29: 'm"月"d"日"', + 30: "m-d-yy", + 31: 'yyyy"年"m"月"d"日"', + 32: 'h"时"mm"分"', + 33: 'h"时"mm"分"ss"秒"', + 34: '上午/下午h"时"mm"分"', + 35: '上午/下午h"时"mm"分"ss"秒"', + 36: 'yyyy"年"m"月"', + # + 50: 'yyyy"年"m"月"', + 51: 'm"月"d"日"', + 52: 'yyyy"年"m"月"', + 53: 'm"月"d"日"', + 54: 'm"月"d"日"', + 55: '上午/下午h"时"mm"分"', + 56: 
'上午/下午h"时"mm"分"ss"秒"', + 57: 'yyyy"年"m"月"', + 58: 'm"月"d"日"', +} +from openpyxl.styles.numbers import BUILTIN_FORMATS + +BUILTIN_FORMATS.update(extra_formats) +``` +示例代码: +```python +""" +使用扩展的中文数字格式读取Excel文件 +扩展openpyxl的数字格式以支持中文日期时间格式 +""" + +from openpyxl import load_workbook +from openpyxl.styles.numbers import BUILTIN_FORMATS +import os + +# 扩展openpyxl的数字格式 +# 此处扩展的是中文格式 +extra_formats = { + 27: 'yyyy"年"m"月"', + 28: 'm"月"d"日"', + 29: 'm"月"d"日"', + 30: "m-d-yy", + 31: 'yyyy"年"m"月"d"日"', + 32: 'h"时"mm"分"', + 33: 'h"时"mm"分"ss"秒"', + 34: '上午/下午h"时"mm"分"', + 35: '上午/下午h"时"mm"分"ss"秒"', + 36: 'yyyy"年"m"月"', + # + 50: 'yyyy"年"m"月"', + 51: 'm"月"d"日"', + 52: 'yyyy"年"m"月"', + 53: 'm"月"d"日"', + 54: 'm"月"d"日"', + 55: '上午/下午h"时"mm"分"', + 56: '上午/下午h"时"mm"分"ss"秒"', + 57: 'yyyy"年"m"月"', + 58: 'm"月"d"日"', +} + +# 更新内置格式 +BUILTIN_FORMATS.update(extra_formats) + + +def read_xlsx_file(file_path): + """ + 读取Excel文件并输出详细信息 + + Args: + file_path: Excel文件路径 + """ + if not os.path.exists(file_path): + print(f"文件不存在: {file_path}") + return + + print(f"正在读取文件: {file_path}") + print("=" * 60) + + try: + # 加载工作簿 + wb = load_workbook(file_path, data_only=False) + + print(f"文件加载成功") + print(f"工作表数量: {len(wb.sheetnames)}") + print(f"工作表名称: {wb.sheetnames}") + print("=" * 60) + + # 遍历所有工作表 + for sheet_name in wb.sheetnames: + print(f"\n工作表: {sheet_name}") + print("-" * 60) + + ws = wb[sheet_name] + + # 获取工作表维度 + if ws.max_row > 0 and ws.max_column > 0: + print(f"数据范围: {ws.max_row} 行 × {ws.max_column} 列") + + # 读取前100行数据(或所有数据,如果少于100行) + max_display_rows = min(100, ws.max_row) + + print(f"\n数据内容(前{max_display_rows}行):") + print("-" * 60) + + for row_idx, row in enumerate(ws.iter_rows(min_row=1, max_row=max_display_rows, values_only=False), start=1): + row_data = [] + for cell in row: + # 获取单元格值 + cell_value = cell.value + + # 获取单元格格式信息 + cell_info = { + 'value': cell_value, + 'coordinate': cell.coordinate, + } + + # 如果有数字格式,获取格式信息 + if cell.number_format: + cell_info['number_format'] = 
cell.number_format + + # 尝试从BUILTIN_FORMATS中查找格式描述 + if cell.number_format in BUILTIN_FORMATS.values(): + # 查找格式ID(注意:ID 0 即 'General' 为假值,会被下面的 if format_id 判断跳过,不会写入 format_id) + format_id = None + for fmt_id, fmt_str in BUILTIN_FORMATS.items(): + if fmt_str == cell.number_format: + format_id = fmt_id + break + if format_id: + cell_info['format_id'] = format_id + + # 获取数据类型 + if cell_value is not None: + cell_info['data_type'] = type(cell_value).__name__ + + row_data.append(cell_info) + + # 输出行数据 + print(f"\n行 {row_idx}:") + for cell_info in row_data: + value_str = str(cell_info['value']) if cell_info['value'] is not None else '(空)' + coord = cell_info['coordinate'] + + info_parts = [f" {coord}: {value_str}"] + + if 'data_type' in cell_info: + info_parts.append(f"[类型: {cell_info['data_type']}]") + + if 'number_format' in cell_info: + fmt = cell_info['number_format'] + if 'format_id' in cell_info: + info_parts.append(f"[格式ID: {cell_info['format_id']}, 格式: {fmt}]") + else: + info_parts.append(f"[格式: {fmt}]") + + print(" ".join(info_parts)) + + if ws.max_row > max_display_rows: + print(f"\n... (还有 {ws.max_row - max_display_rows} 行未显示)") + else: + print("工作表为空") + + print("\n" + "=" * 60) + print("读取完成") + + # 关闭工作簿 + wb.close() + + except Exception as e: + print(f"读取文件时发生错误: {str(e)}") + import traceback + traceback.print_exc() + + +def main(): + """主函数""" + # 默认读取test.xlsx;如需尝试其他文件,可在下面的列表中追加 + test_files = [ + "test.xlsx" + ] + + # 查找存在的文件 + file_to_read = None + for file in test_files: + if os.path.exists(file): + file_to_read = file + break + + if file_to_read: + read_xlsx_file(file_to_read) + else: + print("未找到可用的Excel文件") + print("请将Excel文件放在当前目录,或修改main()函数中的文件路径") + print(f"\n当前目录: {os.getcwd()}") + print(f"可用的xlsx文件:") + for root, dirs, files in os.walk('.'): + for file in files: + if file.endswith('.xlsx') and not file.startswith('~$'): + print(f" - {os.path.join(root, file)}") + + +if __name__ == "__main__": + main() +```