V0521 11:53:00.355000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/__run_lpar_main__.py", 0]}
V0521 11:53:00.357000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/__par__/meta_only/bootstrap.py", 1]}
V0521 11:53:00.358000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/__par__/bootstrap.py", 2]}
V0521 11:53:00.360000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py", 3]}
V0521 11:53:00.361000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/export/__init__.py", 4]}
V0521 11:53:00.362000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/export/_trace.py", 5]}
V0521 11:53:00.363000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/export/exported_program.py", 6]}
V0521 11:53:00.364000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_export/non_strict_utils.py", 7]}
V0521 11:53:00.366000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/utils/_pytree.py", 8]}
V0521 11:53:00.367000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_subclasses/fake_tensor.py", 9]}
V0521 11:53:00.368000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_subclasses/meta_utils.py", 10]}
V0521 11:53:00.369000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:271] {"describe_storage": {"id": 0, "describer_id": 0, "size": 320}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1889, "name": "__call__", "filename": 10, "loc": "t_desc = self.describer.describe_tensor(t, trace=trace)"}, {"line": 311, "name": "describe_tensor", "filename": 10, "loc": "storage = self.describe_storage(t.untyped_storage(), trace=trace)"}, {"line": 271, "name": "describe_storage", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.371000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:486] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [8, 10], "is_leaf": true, "stride": [10, 1], "storage": 0, "view_func": "_CustomViewFunc(func=<built-in method _view_func_unsafe of Tensor object at 0x7f2922352df0>)", "describer_id": 0}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1889, "name": "__call__", "filename": 10, "loc": "t_desc = self.describer.describe_tensor(t, trace=trace)"}, {"line": 486, "name": "describe_tensor", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.373000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:1893] {"describe_source": {"describer_id": 0, "id": 0, "source": "L['args'][0][0]"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1893, "name": "__call__", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.376000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:271] {"describe_storage": {"id": 1, "describer_id": 0, "size": 800}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1889, "name": "__call__", "filename": 10, "loc": "t_desc = self.describer.describe_tensor(t, trace=trace)"}, {"line": 311, "name": "describe_tensor", "filename": 10, "loc": "storage = self.describe_storage(t.untyped_storage(), trace=trace)"}, {"line": 271, "name": "describe_storage", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.378000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:486] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [10, 20], "is_leaf": true, "stride": [20, 1], "storage": 1, "view_func": "_CustomViewFunc(func=<built-in method _view_func_unsafe of Tensor object at 0x7f2922351fe0>)", "describer_id": 0}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1889, "name": "__call__", "filename": 10, "loc": "t_desc = self.describer.describe_tensor(t, trace=trace)"}, {"line": 486, "name": "describe_tensor", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.379000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:1893] {"describe_source": {"describer_id": 0, "id": 1, "source": "L['args'][0][1]"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1893, "name": "__call__", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.381000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:271] {"describe_storage": {"id": 2, "describer_id": 0, "size": 2400}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1889, "name": "__call__", "filename": 10, "loc": "t_desc = self.describer.describe_tensor(t, trace=trace)"}, {"line": 311, "name": "describe_tensor", "filename": 10, "loc": "storage = self.describe_storage(t.untyped_storage(), trace=trace)"}, {"line": 271, "name": "describe_storage", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.383000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:486] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [20, 30], "is_leaf": true, "stride": [30, 1], "storage": 2, "view_func": "_CustomViewFunc(func=<built-in method _view_func_unsafe of Tensor object at 0x7f2922352990>)", "describer_id": 0}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1889, "name": "__call__", "filename": 10, "loc": "t_desc = self.describer.describe_tensor(t, trace=trace)"}, {"line": 486, "name": "describe_tensor", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.385000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:1893] {"describe_source": {"describer_id": 0, "id": 2, "source": "L['args'][0][2]"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1893, "name": "__call__", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.387000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:271] {"describe_storage": {"id": 3, "describer_id": 0, "size": 1200}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1889, "name": "__call__", "filename": 10, "loc": "t_desc = self.describer.describe_tensor(t, trace=trace)"}, {"line": 311, "name": "describe_tensor", "filename": 10, "loc": "storage = self.describe_storage(t.untyped_storage(), trace=trace)"}, {"line": 271, "name": "describe_storage", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.388000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:486] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [10, 30], "is_leaf": true, "stride": [30, 1], "storage": 3, "view_func": "_CustomViewFunc(func=<built-in method _view_func_unsafe of Tensor object at 0x7f2922352fd0>)", "describer_id": 0}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1889, "name": "__call__", "filename": 10, "loc": "t_desc = self.describer.describe_tensor(t, trace=trace)"}, {"line": 486, "name": "describe_tensor", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:00.390000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_subclasses/meta_utils.py:1893] {"describe_source": {"describer_id": 0, "id": 3, "source": "L['args'][0][3]"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 58, "name": "main", "filename": 3, "loc": "ep = torch.export.export(model, example_inputs)"}, {"line": 286, "name": "export", "filename": 4, "loc": "return _export("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2172, "name": "_export", "filename": 5, "loc": "ep = _export_for_training("}, {"line": 1125, "name": "wrapper", "filename": 5, "loc": "ep = fn(*args, **kwargs)"}, {"line": 123, "name": "wrapper", "filename": 6, "loc": "return fn(*args, **kwargs)"}, {"line": 2033, "name": "_export_for_training", "filename": 5, "loc": "export_artifact = export_func("}, {"line": 1933, "name": "_non_strict_export", "filename": 5, "loc": ") = make_fake_inputs("}, {"line": 345, "name": "make_fake_inputs", "filename": 7, "loc": "fake_args, fake_kwargs = tree_map_with_path("}, {"line": 2061, "name": "tree_map_with_path", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 1193, "name": "unflatten", "filename": 8, "loc": "leaves = list(leaves)"}, {"line": 2061, "name": "<genexpr>", "filename": 8, "loc": "return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves))"}, {"line": 346, "name": "<lambda>", "filename": 7, "loc": "lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources),"}, {"line": 160, "name": "fakify", "filename": 7, "loc": "fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context)"}, {"line": 2919, "name": "from_tensor", "filename": 9, "loc": "return self.fake_tensor_converter.from_real_tensor("}, {"line": 398, "name": "from_real_tensor", "filename": 9, "loc": "out = self.meta_converter("}, {"line": 1893, "name": "__call__", "filename": 10, "loc": "trace_structured("}]}
V0521 11:53:02.682000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "507ea14d7e47092d1f061b81d3b50038"}
	{
	"name": "compile_fx_aot",
	"ts": 1747853582682082.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:02.688000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_inductor/__init__.py", 11]}
V0521 11:53:02.690000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_inductor/debug.py", 12]}
V0521 11:53:02.691000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_inductor/compile_fx.py", 13]}
V0521 11:53:02.693000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/compile_fx.py:2045] {"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2045, "name": "compile_fx", "filename": 13, "loc": "trace_structured("}], "has_payload": "81a75ece5c64a299bd2677dbf51edce7"}
	class GraphModule(torch.nn.Module):
	    def forward(self, x: "f32[8, 10][10, 1]cuda:0", a: "f32[10, 20][20, 1]cuda:0", b: "f32[20, 30][30, 1]cuda:0", c: "f32[10, 30][30, 1]cuda:0"):
	        # No stacktrace found for following nodes
	        fc1_weight: "f32[16, 10][10, 1]cuda:0" = self.fc1.weight
	        fc1_bias: "f32[16][1]cuda:0" = self.fc1.bias
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/linear.py:125 in forward, code: return F.linear(input, self.weight, self.bias)
	        linear: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.linear.default(x, fc1_weight, fc1_bias);  x = fc1_weight = fc1_bias = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/activation.py:135 in forward, code: return F.relu(input, inplace=self.inplace)
	        relu: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.relu.default(linear);  linear = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/activation.py:329 in forward, code: return torch.sigmoid(input)
	        sigmoid: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.sigmoid.default(relu);  relu = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:32 in forward, code: d = a * 3.14
	        mul: "f32[10, 20][20, 1]cuda:0" = torch.ops.aten.mul.Tensor(a, 3.14);  a = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:33 in forward, code: y = torch.addmm(c, d, b)
	        addmm: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.addmm.default(c, mul, b);  c = mul = b = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:34 in forward, code: z = torch.nn.functional.gelu(y)
	        gelu: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.gelu.default(addmm);  addmm = None
	        return (sigmoid, gelu)
	        
	
	 # graph id: 139811696436544
V0521 11:53:02.709000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "f8f6927e313045ce45a02d53aa410653"}
	{
	"name": "_recursive_pre_grad_passes",
	"ts": 1747853582709631.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.415000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "84d9c24eccff81dbe891f1c6e19c3e29"}
	{
	"name": "_recursive_pre_grad_passes",
	"ts": 1747853583415742.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.422000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/compile_fx.py:2069] {"artifact": {"name": "after_pre_grad_graph", "encoding": "string"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2069, "name": "compile_fx", "filename": 13, "loc": "trace_structured("}], "has_payload": "81a75ece5c64a299bd2677dbf51edce7"}
	class GraphModule(torch.nn.Module):
	    def forward(self, x: "f32[8, 10][10, 1]cuda:0", a: "f32[10, 20][20, 1]cuda:0", b: "f32[20, 30][30, 1]cuda:0", c: "f32[10, 30][30, 1]cuda:0"):
	        # No stacktrace found for following nodes
	        fc1_weight: "f32[16, 10][10, 1]cuda:0" = self.fc1.weight
	        fc1_bias: "f32[16][1]cuda:0" = self.fc1.bias
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/linear.py:125 in forward, code: return F.linear(input, self.weight, self.bias)
	        linear: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.linear.default(x, fc1_weight, fc1_bias);  x = fc1_weight = fc1_bias = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/activation.py:135 in forward, code: return F.relu(input, inplace=self.inplace)
	        relu: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.relu.default(linear);  linear = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/activation.py:329 in forward, code: return torch.sigmoid(input)
	        sigmoid: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.sigmoid.default(relu);  relu = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:32 in forward, code: d = a * 3.14
	        mul: "f32[10, 20][20, 1]cuda:0" = torch.ops.aten.mul.Tensor(a, 3.14);  a = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:33 in forward, code: y = torch.addmm(c, d, b)
	        addmm: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.addmm.default(c, mul, b);  c = mul = b = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:34 in forward, code: z = torch.nn.functional.gelu(y)
	        gelu: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.gelu.default(addmm);  addmm = None
	        return (sigmoid, gelu)
	        
	
	 # graph id: 139811696436544
V0521 11:53:03.426000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "11c757a7b0cd9bc1bd38f6018fbfd2a3"}
	{
	"name": "create_aot_dispatcher_function",
	"ts": 1747853583426788.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.434000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "e5e162e1e8a2609690595448d5d6c367"}
	{
	"name": "aot_collect_metadata",
	"ts": 1747853583434347.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.458000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "79f1111879402829a5892b2b940a6123"}
	{
	"name": "aot_collect_metadata",
	"ts": 1747853583458375.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.506000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_functorch/aot_autograd.py", 14]}
V0521 11:53:03.507000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py", 15]}
V0521 11:53:03.509000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py", 16]}
V0521 11:53:03.511000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:214] {"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2291, "name": "compile_fx", "filename": 13, "loc": "gm, graph_signature = aot_export_module("}, {"line": 1377, "name": "aot_export_module", "filename": 14, "loc": "fx_g, metadata, in_spec, out_spec = _aot_export_function("}, {"line": 1616, "name": "_aot_export_function", "filename": 14, "loc": "fx_g, meta = create_aot_dispatcher_function("}, {"line": 576, "name": "create_aot_dispatcher_function", "filename": 14, "loc": "return _create_aot_dispatcher_function("}, {"line": 836, "name": "_create_aot_dispatcher_function", "filename": 14, "loc": "compiled_fn, fw_metadata = compiler_fn("}, {"line": 126, "name": "aot_dispatch_export", "filename": 15, "loc": "graph, _, _ = aot_dispatch_base_graph("}, {"line": 214, "name": "aot_dispatch_base_graph", "filename": 16, "loc": "trace_structured("}], "has_payload": "7c74a144760cefb8a406d8dc8f099cf1"}
	ViewAndMutationMeta(input_info=[InputAliasInfo(is_leaf=True,
	                                              mutates_data=False,
	                                              mutates_metadata=False,
	                                              mutations_hidden_from_autograd=True,
	                                              mutations_under_no_grad_or_inference_mode=False,
	                                              mutation_inductor_storage_resize=False,
	                                              mutates_storage_metadata=False,
	                                              requires_grad=True,
	                                              keep_input_mutations=False),
	                               InputAliasInfo(is_leaf=True,
	                                              mutates_data=False,
	                                              mutates_metadata=False,
	                                              mutations_hidden_from_autograd=True,
	                                              mutations_under_no_grad_or_inference_mode=False,
	                                              mutation_inductor_storage_resize=False,
	                                              mutates_storage_metadata=False,
	                                              requires_grad=True,
	                                              keep_input_mutations=False),
	                               InputAliasInfo(is_leaf=True,
	                                              mutates_data=False,
	                                              mutates_metadata=False,
	                                              mutations_hidden_from_autograd=True,
	                                              mutations_under_no_grad_or_inference_mode=False,
	                                              mutation_inductor_storage_resize=False,
	                                              mutates_storage_metadata=False,
	                                              requires_grad=False,
	                                              keep_input_mutations=False),
	                               InputAliasInfo(is_leaf=True,
	                                              mutates_data=False,
	                                              mutates_metadata=False,
	                                              mutations_hidden_from_autograd=True,
	                                              mutations_under_no_grad_or_inference_mode=False,
	                                              mutation_inductor_storage_resize=False,
	                                              mutates_storage_metadata=False,
	                                              requires_grad=False,
	                                              keep_input_mutations=False),
	                               InputAliasInfo(is_leaf=True,
	                                              mutates_data=False,
	                                              mutates_metadata=False,
	                                              mutations_hidden_from_autograd=True,
	                                              mutations_under_no_grad_or_inference_mode=False,
	                                              mutation_inductor_storage_resize=False,
	                                              mutates_storage_metadata=False,
	                                              requires_grad=False,
	                                              keep_input_mutations=False),
	                               InputAliasInfo(is_leaf=True,
	                                              mutates_data=False,
	                                              mutates_metadata=False,
	                                              mutations_hidden_from_autograd=True,
	                                              mutations_under_no_grad_or_inference_mode=False,
	                                              mutation_inductor_storage_resize=False,
	                                              mutates_storage_metadata=False,
	                                              requires_grad=False,
	                                              keep_input_mutations=False)],
	                    output_info=[OutputAliasInfo(output_type=<OutputType.non_alias: 1>,
	                                                raw_type=<class 'torch._subclasses.functional_tensor.FunctionalTensor'>,
	                                                base_idx=None,
	                                                dynamic_dims=set(),
	                                                requires_grad=False,
	                                                functional_tensor=None),
	                                OutputAliasInfo(output_type=<OutputType.non_alias: 1>,
	                                                raw_type=<class 'torch._subclasses.functional_tensor.FunctionalTensor'>,
	                                                base_idx=None,
	                                                dynamic_dims=set(),
	                                                requires_grad=False,
	                                                functional_tensor=None)],
	                    num_intermediate_bases=0,
	                    keep_input_mutations=False,
	                    traced_tangents=[],
	                    subclass_inp_meta=[PlainTensorMeta(unwrapped_idx=0,
	                                                      memory_format=None),
	                                      PlainTensorMeta(unwrapped_idx=1,
	                                                      memory_format=None),
	                                      PlainTensorMeta(unwrapped_idx=2,
	                                                      memory_format=None),
	                                      PlainTensorMeta(unwrapped_idx=3,
	                                                      memory_format=None),
	                                      PlainTensorMeta(unwrapped_idx=4,
	                                                      memory_format=None),
	                                      PlainTensorMeta(unwrapped_idx=5,
	                                                      memory_format=None)],
	                    subclass_fw_graph_out_meta=[PlainTensorMeta(unwrapped_idx=0,
	                                                               memory_format=None),
	                                               PlainTensorMeta(unwrapped_idx=1,
	                                                               memory_format=None)],
	                    subclass_tangent_meta=[],
	                    is_train=False,
	                    traced_tangent_metas=None,
	                    num_symints_saved_for_bw=None,
	                    grad_enabled_mutation=None,
	                    deterministic=None,
	                    static_input_indices=[],
	                    tokens={},
	                    indices_of_inputs_that_requires_grad_with_mutations_in_bw=[],
	                    bw_donated_idxs=None,
	                    num_backward_tokens=0,
	                    num_graphsafe_rng_states=0,
	                    graphsafe_rng_state_index=None)
V0521 11:53:03.514000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:232] {"aot_inference_graph": {}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2291, "name": "compile_fx", "filename": 13, "loc": "gm, graph_signature = aot_export_module("}, {"line": 1377, "name": "aot_export_module", "filename": 14, "loc": "fx_g, metadata, in_spec, out_spec = _aot_export_function("}, {"line": 1616, "name": "_aot_export_function", "filename": 14, "loc": "fx_g, meta = create_aot_dispatcher_function("}, {"line": 576, "name": "create_aot_dispatcher_function", "filename": 14, "loc": "return _create_aot_dispatcher_function("}, {"line": 836, "name": "_create_aot_dispatcher_function", "filename": 14, "loc": "compiled_fn, fw_metadata = compiler_fn("}, {"line": 126, "name": "aot_dispatch_export", "filename": 15, "loc": "graph, _, _ = aot_dispatch_base_graph("}, {"line": 232, "name": "aot_dispatch_base_graph", "filename": 16, "loc": "trace_structured("}], "has_payload": "26c679cec8693213484b3e7a9522f6b2"}
	class <lambda>(torch.nn.Module):
	    def forward(self, arg0_1: "f32[16, 10][10, 1]cuda:0", arg1_1: "f32[16][1]cuda:0", arg2_1: "f32[8, 10][10, 1]cuda:0", arg3_1: "f32[10, 20][20, 1]cuda:0", arg4_1: "f32[20, 30][30, 1]cuda:0", arg5_1: "f32[10, 30][30, 1]cuda:0"):
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/linear.py:125 in forward, code: return F.linear(input, self.weight, self.bias)
	        permute: "f32[10, 16][1, 10]cuda:0" = torch.ops.aten.permute.default(arg0_1, [1, 0]);  arg0_1 = None
	        addmm: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.addmm.default(arg1_1, arg2_1, permute);  arg1_1 = arg2_1 = permute = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/activation.py:135 in forward, code: return F.relu(input, inplace=self.inplace)
	        relu: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.relu.default(addmm);  addmm = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/activation.py:329 in forward, code: return torch.sigmoid(input)
	        sigmoid: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.sigmoid.default(relu);  relu = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:32 in forward, code: d = a * 3.14
	        mul: "f32[10, 20][20, 1]cuda:0" = torch.ops.aten.mul.Tensor(arg3_1, 3.14);  arg3_1 = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:33 in forward, code: y = torch.addmm(c, d, b)
	        addmm_1: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.addmm.default(arg5_1, mul, arg4_1);  arg5_1 = mul = arg4_1 = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:34 in forward, code: z = torch.nn.functional.gelu(y)
	        mul_1: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.mul.Tensor(addmm_1, 0.5)
	        mul_2: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.mul.Tensor(addmm_1, 0.7071067811865476);  addmm_1 = None
	        erf: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.erf.default(mul_2);  mul_2 = None
	        add: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.add.Tensor(erf, 1);  erf = None
	        mul_3: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.mul.Tensor(mul_1, add);  mul_1 = add = None
	        return (sigmoid, mul_3)
	        
V0521 11:53:03.520000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "3b661fe498924ae26313eeacf2e174ab"}
	{
	"name": "create_aot_dispatcher_function",
	"ts": 1747853583520522.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.524000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "366b4804c05fb35fc023efce2c11e1a2"}
	{
	"name": "compile_fx.<locals>.fw_compiler_base",
	"ts": 1747853583524426.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.526000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "20ec9ca64474ec8065d0dc07644b07c8"}
	{
	"name": "_recursive_joint_graph_passes",
	"ts": 1747853583526359.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.810000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "06b2b428d1122c201d23f09c1bd5ec83"}
	{
	"name": "pad_mm_benchmark",
	"ts": 1747853583810193.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.812000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "20e204f9dbf86439fe2f3ba9826ffe82"}
	{
	"name": "pad_mm_benchmark_get_do_bench",
	"ts": 1747853583812796.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.815000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "ca90c75b64b02d551cfd5b64fd59d32d"}
	{
	"name": "pad_mm_benchmark_get_do_bench",
	"ts": 1747853583815313.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.826000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "263720d993378e56f0e65619f3b44b4d"}
	{
	"name": "pad_mm_benchmark",
	"ts": 1747853583826039.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.832000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "3031b170565a9d545a86b3cde03776ba"}
	{
	"name": "pad_mm_benchmark",
	"ts": 1747853583831869.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.835000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "389b2544d09816dd6451708c2c650c0d"}
	{
	"name": "pad_mm_benchmark_get_do_bench",
	"ts": 1747853583835155.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.838000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "3477a486ddea4f820422df04e4e9d81a"}
	{
	"name": "pad_mm_benchmark_get_do_bench",
	"ts": 1747853583838121.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.842000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "f6e84d629f724f472b6ebbbcd970e4e7"}
	{
	"name": "pad_mm_benchmark",
	"ts": 1747853583841942.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.845000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "7a0a7a12860bf894bd7270a97125a564"}
	{
	"name": "_recursive_joint_graph_passes",
	"ts": 1747853583845295.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.848000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "89986201fd2e44d8f7a42ee3992224f9"}
	{
	"name": "inductor_compile",
	"ts": 1747853583848613.0,
	"args": {
	"fn_name": "compile_fx_inner",
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.879000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "586cbbb316065aa4a91fce0220b10e22"}
	{
	"name": "fx_codegen_and_compile",
	"ts": 1747853583878956.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.894000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/usr/local/fbcode/platform010/lib/python3.12/contextlib.py", 17]}
V0521 11:53:03.895000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_dynamo/repro/after_aot.py", 18]}
V0521 11:53:03.897000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_inductor/fb/utils.py", 19]}
V0521 11:53:03.898000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/compile_fx.py:1109] {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2343, "name": "compile_fx", "filename": 13, "loc": "return inference_compiler(unlifted_gm, example_inputs_)"}, {"line": 483, "name": "__call__", "filename": 14, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2184, "name": "fw_compiler_base", "filename": 13, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 17, "loc": "return func(*args, **kwds)"}, {"line": 710, "name": "compile_fx_inner", "filename": 13, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 124, "name": "debug_wrapper", "filename": 18, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 167, "name": "newFunction", "filename": 19, "loc": "return old_func(*args, **kwargs)"}, {"line": 853, "name": "_compile_fx_inner", "filename": 13, "loc": "mb_compiled_graph = fx_codegen_and_compile("}, {"line": 1503, "name": "fx_codegen_and_compile", "filename": 13, "loc": "return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)"}, {"line": 1109, "name": "codegen_and_compile", "filename": 13, "loc": "trace_structured("}], "has_payload": "2cf942f6175d8ea725613822361a2783"}
	
	import os
	os.environ['TORCH_TRACE'] = '/home/shangdiy/my_trace_log_dir'
	os.environ['TORCH_COMPILE_DEBUG'] = '1'
	os.environ['TORCH_LOGS'] = '+inductor'
	os.environ['PYTORCH_DDP_USE_SIDE_STREAM'] = '0'
	os.environ['TORCHINDUCTOR_CACHE_DIR'] = '/var/tmp/torchinductor_shangdiy'
	
	import torch
	from torch import tensor, device
	import torch.fx as fx
	from torch._dynamo.testing import rand_strided
	from math import inf
	import torch._inductor.inductor_prims
	
	import torch._dynamo.config
	import torch._inductor.config
	import torch._functorch.config
	import torch.fx.experimental._config
	torch._dynamo.config.specialize_int = False
	torch._dynamo.config.specialize_float = False
	torch._dynamo.config.assume_static_by_default = True
	torch._dynamo.config.automatic_dynamic_shapes = True
	torch._dynamo.config.capture_scalar_outputs = False
	torch._dynamo.config.capture_dynamic_output_shape_ops = False
	torch._dynamo.config.prefer_deferred_runtime_asserts_over_guards = False
	torch._dynamo.config.do_not_emit_runtime_asserts = False
	torch._dynamo.config.allow_rnn = False
	torch._inductor.config.cpp_wrapper = True
	torch._inductor.config.triton.cudagraphs = False
	torch._inductor.config.triton.autotune_cublasLt = False
	torch._inductor.config.triton.autotune_at_compile_time = True
	torch._inductor.config.triton.store_cubin = True
	torch._inductor.config.aot_inductor.output_path = 'cwhkamk7hukdm5d55b4fxkyyok5x57mzbc2hzfy243x4xp2dcbtz'
	torch._inductor.config.aot_inductor.serialized_in_spec = '[1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}, {"type": null, "context": null, "children_spec": []}, {"type": null, "context": null, "children_spec": []}, {"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}]'
	torch._inductor.config.aot_inductor.serialized_out_spec = '[1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}, {"type": null, "context": null, "children_spec": []}]}]'
	torch._inductor.config.aot_inductor.package = True
	torch._functorch.config.functionalize_rng_ops = False
	torch._functorch.config.fake_tensor_allow_unsafe_data_ptr_access = True
	torch._functorch.config.unlift_effect_tokens = False
	
	
	
	isolate_fails_code_str = None
	
	torch.ops.load_library("//caffe2/torch/fb/sparsenn:sparsenn_operators_gpu")
	torch.ops.load_library("//caffe2/torch/fb/sparsenn:sparsenn_operators")
	torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
	torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
	
	"""
	To run this script in fbcode:
	- Create a directory (//scripts/{your_unixname}/repro)
	- Put this file in scripts/{your_unixname}/repro/fx_graph_runnable.py
	- Add a TARGETS file that looks like the following
	- `buck2 run //scripts/{your_unixname}/repro:repro`
	
	NOTE: you may need additional deps to actually be able to run the script.
	```
	# Contents of TARGETS file
	load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary")
	
	python_binary(
	    name = "repro",
	    main_src = "fx_graph_runnable.py",
	    deps = [
	        "//caffe2:torch",
	        "//caffe2/torch/fb/sparsenn:sparsenn_operators_gpu",
	        "//caffe2/torch/fb/sparsenn:sparsenn_operators",
	        "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu",
	        "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops",
	    ],
	)
	```
	"""
	
	# torch version: 2.8.0a0+fb
	# torch cuda version: 12.4.0
	# CUDA Info: 
	# nvcc not found
	# GPU Hardware Info: 
	# NVIDIA PG509-210 : 1 
	
	
	from torch.nn import *
	class Repro(torch.nn.Module):
	    def __init__(self) -> None:
	        super().__init__()
	        self.fc1 = Module().cuda()
	
	    
	    
	    def forward(self):
	        arg2_1, arg3_1, arg4_1, arg5_1, = fx_pytree.tree_flatten_spec([], self._in_spec)
	        fc1_weight = self.fc1.weight
	        fc1_bias = self.fc1.bias
	        permute = torch.ops.aten.permute.default(fc1_weight, [1, 0]);  fc1_weight = None
	        addmm = torch.ops.aten.addmm.default(fc1_bias, arg2_1, permute);  fc1_bias = arg2_1 = permute = None
	        relu = torch.ops.aten.relu.default(addmm);  addmm = None
	        sigmoid = torch.ops.aten.sigmoid.default(relu);  relu = None
	        mul = torch.ops.aten.mul.Tensor(arg3_1, 3.14);  arg3_1 = None
	        addmm_1 = torch.ops.aten.addmm.default(arg5_1, mul, arg4_1);  arg5_1 = mul = arg4_1 = None
	        mul_1 = torch.ops.aten.mul.Tensor(addmm_1, 0.5)
	        mul_2 = torch.ops.aten.mul.Tensor(addmm_1, 0.7071067811865476);  addmm_1 = None
	        erf = torch.ops.aten.erf.default(mul_2);  mul_2 = None
	        add = torch.ops.aten.add.Tensor(erf, 1);  erf = None
	        mul_3 = torch.ops.aten.mul.Tensor(mul_1, add);  mul_1 = add = None
	        return (sigmoid, mul_3)
	        
	def load_args(reader):
	    buf0 = reader.storage(None, 320, device=device(type='cuda', index=0))
	    reader.tensor(buf0, (8, 10), is_leaf=True)  # arg2_1
	    buf1 = reader.storage(None, 800, device=device(type='cuda', index=0))
	    reader.tensor(buf1, (10, 20), is_leaf=True)  # arg3_1
	    buf2 = reader.storage(None, 2400, device=device(type='cuda', index=0))
	    reader.tensor(buf2, (20, 30), is_leaf=True)  # arg4_1
	    buf3 = reader.storage(None, 1200, device=device(type='cuda', index=0))
	    reader.tensor(buf3, (10, 30), is_leaf=True)  # arg5_1
	load_args._version = 0
	mod = Repro()
	if __name__ == '__main__':
	    from torch._dynamo.repro.after_aot import run_repro
	    with torch.no_grad():
	        run_repro(mod, load_args, accuracy=False, command='run', save_dir=None, tracing_mode='real', check_str=None)
	        # To run it separately, do 
	        # mod, args = run_repro(mod, load_args, accuracy=False, command='get_args', save_dir=None, tracing_mode='real', check_str=None)
	        # mod(*args)
V0521 11:53:03.904000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "cb6668bf0cbe96819d3f592a1944efff"}
	{
	"name": "additional_fake_tensor_prop",
	"ts": 1747853583904821.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.923000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "be711f4d87b6bdc052595be49f6a883e"}
	{
	"name": "additional_fake_tensor_prop",
	"ts": 1747853583923403.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:03.929000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/compile_fx.py:1158] {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2343, "name": "compile_fx", "filename": 13, "loc": "return inference_compiler(unlifted_gm, example_inputs_)"}, {"line": 483, "name": "__call__", "filename": 14, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2184, "name": "fw_compiler_base", "filename": 13, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 17, "loc": "return func(*args, **kwds)"}, {"line": 710, "name": "compile_fx_inner", "filename": 13, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 124, "name": "debug_wrapper", "filename": 18, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 167, "name": "newFunction", "filename": 19, "loc": "return old_func(*args, **kwargs)"}, {"line": 853, "name": "_compile_fx_inner", "filename": 13, "loc": "mb_compiled_graph = fx_codegen_and_compile("}, {"line": 1503, "name": "fx_codegen_and_compile", "filename": 13, "loc": "return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)"}, {"line": 1158, "name": "codegen_and_compile", "filename": 13, "loc": "trace_structured("}], "has_payload": "352a9b4053331f4a5f7587f1b9b2d74b"}
	class <lambda>(torch.nn.Module):
	    def forward(self):
	        arg2_1: "f32[8, 10][10, 1]cuda:0"; arg3_1: "f32[10, 20][20, 1]cuda:0"; arg4_1: "f32[20, 30][30, 1]cuda:0"; arg5_1: "f32[10, 30][30, 1]cuda:0"; 
	    
	        arg2_1, arg3_1, arg4_1, arg5_1, = fx_pytree.tree_flatten_spec([], self._in_spec)
	        # No stacktrace found for following nodes
	        fc1_weight: "f32[16, 10][10, 1]cuda:0" = self.fc1.weight
	        fc1_bias: "f32[16][1]cuda:0" = self.fc1.bias
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/linear.py:125 in forward, code: return F.linear(input, self.weight, self.bias)
	        permute: "f32[10, 16][1, 10]cuda:0" = torch.ops.aten.permute.default(fc1_weight, [1, 0]);  fc1_weight = None
	        addmm: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.addmm.default(fc1_bias, arg2_1, permute);  fc1_bias = arg2_1 = permute = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/activation.py:135 in forward, code: return F.relu(input, inplace=self.inplace)
	        relu: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.relu.default(addmm);  addmm = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/activation.py:329 in forward, code: return torch.sigmoid(input)
	        sigmoid: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.sigmoid.default(relu);  relu = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:32 in forward, code: d = a * 3.14
	        mul: "f32[10, 20][20, 1]cuda:0" = torch.ops.aten.mul.Tensor(arg3_1, 3.14);  arg3_1 = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:33 in forward, code: y = torch.addmm(c, d, b)
	        addmm_1: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.addmm.default(arg5_1, mul, arg4_1);  arg5_1 = mul = arg4_1 = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:34 in forward, code: z = torch.nn.functional.gelu(y)
	        mul_1: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.mul.Tensor(addmm_1, 0.5)
	        mul_2: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.mul.Tensor(addmm_1, 0.7071067811865476);  addmm_1 = None
	        erf: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.erf.default(mul_2);  mul_2 = None
	        add: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.add.Tensor(erf, 1);  erf = None
	        mul_3: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.mul.Tensor(mul_1, add);  mul_1 = add = None
	        return (sigmoid, mul_3)
	        
V0521 11:53:03.932000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "5f5c234d8970f53027eb201523693a7b"}
	{
	"name": "_recursive_post_grad_passes",
	"ts": 1747853583932532.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.002000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "318303012ee63c2dcf77b5a08832ae77"}
	{
	"name": "_recursive_post_grad_passes",
	"ts": 1747853584002705.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.015000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/compile_fx.py:1194] {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2343, "name": "compile_fx", "filename": 13, "loc": "return inference_compiler(unlifted_gm, example_inputs_)"}, {"line": 483, "name": "__call__", "filename": 14, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2184, "name": "fw_compiler_base", "filename": 13, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 17, "loc": "return func(*args, **kwds)"}, {"line": 710, "name": "compile_fx_inner", "filename": 13, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 124, "name": "debug_wrapper", "filename": 18, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 167, "name": "newFunction", "filename": 19, "loc": "return old_func(*args, **kwargs)"}, {"line": 853, "name": "_compile_fx_inner", "filename": 13, "loc": "mb_compiled_graph = fx_codegen_and_compile("}, {"line": 1503, "name": "fx_codegen_and_compile", "filename": 13, "loc": "return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)"}, {"line": 1194, "name": "codegen_and_compile", "filename": 13, "loc": "trace_structured("}], "has_payload": "a4e7e7eacd5f4b3db10a51abc333e0fe"}
	class <lambda>(torch.nn.Module):
	    def forward(self):
	        arg2_1: "f32[8, 10][10, 1]cuda:0"; arg3_1: "f32[10, 20][20, 1]cuda:0"; arg4_1: "f32[20, 30][30, 1]cuda:0"; arg5_1: "f32[10, 30][30, 1]cuda:0"; 
	    
	        arg2_1, arg3_1, arg4_1, arg5_1, = fx_pytree.tree_flatten_spec([], self._in_spec)
	        # No stacktrace found for following nodes
	        fc1_weight: "f32[16, 10][10, 1]cuda:0" = self.fc1.weight
	        fc1_bias: "f32[16][1]cuda:0" = self.fc1.bias
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/linear.py:125 in forward, code: return F.linear(input, self.weight, self.bias)
	        permute: "f32[10, 16][1, 10]cuda:0" = torch.ops.aten.permute.default(fc1_weight, [1, 0]);  fc1_weight = None
	        
	        # No stacktrace found for following nodes
	        mm_default_1: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.mm.default(arg2_1, permute);  arg2_1 = permute = None
	        add_tensor_1: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.add.Tensor(mm_default_1, fc1_bias);  mm_default_1 = fc1_bias = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/activation.py:135 in forward, code: return F.relu(input, inplace=self.inplace)
	        relu: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.relu.default(add_tensor_1);  add_tensor_1 = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/activation.py:329 in forward, code: return torch.sigmoid(input)
	        sigmoid: "f32[8, 16][16, 1]cuda:0" = torch.ops.aten.sigmoid.default(relu);  relu = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:32 in forward, code: d = a * 3.14
	        mul: "f32[10, 20][20, 1]cuda:0" = torch.ops.aten.mul.Tensor(arg3_1, 3.14);  arg3_1 = None
	        
	        # No stacktrace found for following nodes
	        mm_default: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.mm.default(mul, arg4_1);  mul = arg4_1 = None
	        add_tensor: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.add.Tensor(mm_default, arg5_1);  mm_default = arg5_1 = None
	        
	         # File: /data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py:34 in forward, code: z = torch.nn.functional.gelu(y)
	        mul_1: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.mul.Tensor(add_tensor, 0.5)
	        mul_2: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.mul.Tensor(add_tensor, 0.7071067811865476);  add_tensor = None
	        erf: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.erf.default(mul_2);  mul_2 = None
	        add: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.add.Tensor(erf, 1);  erf = None
	        mul_3: "f32[10, 30][30, 1]cuda:0" = torch.ops.aten.mul.Tensor(mul_1, add);  mul_1 = add = None
	        return (sigmoid, mul_3)
	        
V0521 11:53:04.019000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/compile_fx.py:1206] {"artifact": {"name": "inductor_post_to_pre_grad_nodes", "encoding": "json"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2343, "name": "compile_fx", "filename": 13, "loc": "return inference_compiler(unlifted_gm, example_inputs_)"}, {"line": 483, "name": "__call__", "filename": 14, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2184, "name": "fw_compiler_base", "filename": 13, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 17, "loc": "return func(*args, **kwds)"}, {"line": 710, "name": "compile_fx_inner", "filename": 13, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 124, "name": "debug_wrapper", "filename": 18, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 167, "name": "newFunction", "filename": 19, "loc": "return old_func(*args, **kwargs)"}, {"line": 853, "name": "_compile_fx_inner", "filename": 13, "loc": "mb_compiled_graph = fx_codegen_and_compile("}, {"line": 1503, "name": "fx_codegen_and_compile", "filename": 13, "loc": "return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)"}, {"line": 1206, "name": "codegen_and_compile", "filename": 13, "loc": "trace_structured("}], "has_payload": "71818e0a3ebb21821524041d12c0bea9"}
	{"permute": [{"name": "linear", "target": "aten.linear.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}], "mm_default_1": [{"name": "", "target": "", "graph_id": -1, "pass_name": "pass_pattern_2", "action": "create", "from_node": []}, {"name": "", "target": "", "graph_id": -1, "pass_name": "pattern_matcher", "action": "create", "from_node": []}, {"name": "mm", "target": "aten.mm.default", "graph_id": 139810834098720, "pass_name": "Interpreter_Replacer", "action": "replace", "from_node": [{"name": "addmm", "target": "aten.addmm.default", "graph_id": 139811521629216, "pass_name": "replace_by_example", "action": "replace", "from_node": [{"name": "linear", "target": "aten.linear.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}]}], "add_tensor_1": [{"name": "add", "target": "aten.add.Tensor", "graph_id": 139810834098720, "pass_name": "Interpreter_Replacer", "action": "replace", "from_node": [{"name": "addmm", "target": "aten.addmm.default", "graph_id": 139811521629216, "pass_name": "replace_by_example", "action": "replace", "from_node": [{"name": "linear", "target": "aten.linear.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}]}, {"name": "addmm", "target": "aten.addmm.default", "graph_id": 139811521629216, "pass_name": "pass_pattern_2", "action": "replace+create", "from_node": [{"name": "linear", "target": "aten.linear.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}, {"name": "addmm", "target": "aten.addmm.default", "graph_id": 139811521629216, "pass_name": "pattern_matcher", "action": "replace+create", "from_node": [{"name": "linear", "target": "aten.linear.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}], "relu": [{"name": "relu", "target": "aten.relu.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}], "sigmoid": [{"name": "sigmoid", "target": "aten.sigmoid.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}], "mul": [{"name": "mul", "target": "aten.mul.Tensor", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}], "mm_default": [{"name": "", "target": "", "graph_id": -1, "pass_name": "pass_pattern_2", "action": "create", "from_node": []}, {"name": "", "target": "", "graph_id": -1, "pass_name": "pattern_matcher", "action": "create", "from_node": []}, {"name": "mm", "target": "aten.mm.default", "graph_id": 139809927557632, "pass_name": "Interpreter_Replacer", "action": "replace", "from_node": [{"name": "addmm_1", "target": "aten.addmm.default", "graph_id": 139811521629216, "pass_name": "replace_by_example", "action": "replace", "from_node": [{"name": "addmm", "target": "aten.addmm.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}]}], "add_tensor": [{"name": "add", "target": "aten.add.Tensor", "graph_id": 139809927557632, "pass_name": "Interpreter_Replacer", "action": "replace", "from_node": [{"name": "addmm_1", "target": "aten.addmm.default", "graph_id": 139811521629216, "pass_name": "replace_by_example", "action": "replace", "from_node": [{"name": "addmm", "target": "aten.addmm.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}]}, {"name": "addmm_1", "target": "aten.addmm.default", "graph_id": 139811521629216, "pass_name": "pass_pattern_2", "action": "replace+create", "from_node": [{"name": "addmm", "target": "aten.addmm.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}, {"name": "addmm_1", "target": "aten.addmm.default", "graph_id": 139811521629216, "pass_name": "pattern_matcher", "action": "replace+create", "from_node": [{"name": "addmm", "target": "aten.addmm.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}, {"name": "addmm_1", "target": "aten.addmm.default", "graph_id": 139811521629216, "pass_name": "pass_pattern_2", "action": "replace+create", "from_node": [{"name": "addmm", "target": "aten.addmm.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}, {"name": "addmm_1", "target": "aten.addmm.default", "graph_id": 139811521629216, "pass_name": "pattern_matcher", "action": "replace+create", "from_node": [{"name": "addmm", "target": "aten.addmm.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}], "mul_1": [{"name": "gelu", "target": "aten.gelu.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}], "mul_2": [{"name": "gelu", "target": "aten.gelu.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}], "erf": [{"name": "gelu", "target": "aten.gelu.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}], "add": [{"name": "gelu", "target": "aten.gelu.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}], "mul_3": [{"name": "gelu", "target": "aten.gelu.default", "graph_id": 139811696436544, "pass_name": "Interpreter_PropagateUnbackedSymInts", "action": "create", "from_node": []}]}
V0521 11:53:04.035000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "96cbfe7d3ae715d12fbdb12a637862c2"}
	{
	"name": "GraphLowering.run",
	"ts": 1747853584035216.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.186000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "169cb8e3c3f015550ec51fcb875cd6bf"}
	{
	"name": "GraphLowering.run",
	"ts": 1747853584186048.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.188000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "d080e983fc005afbbba17d596e196085"}
	{
	"name": "GraphLowering.compile_to_fn",
	"ts": 1747853584188346.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.190000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "3fc448e243657d74ad7b65b2ccfe96d6"}
	{
	"name": "GraphLowering.codegen",
	"ts": 1747853584190310.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.196000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "589b830a5a590450b1e8a447cbb2c71e"}
	{
	"name": "Scheduler.__init__",
	"ts": 1747853584195999.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.264000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "8bdd4612d9c21a36048b66d07996711d"}
	{
	"name": "Scheduler.fused_nodes",
	"ts": 1747853584264661.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.268000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "4b98765c41e923a11b4615ae3218938a"}
	{
	"name": "Scheduler.fused_nodes",
	"ts": 1747853584268179.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.301000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "4448f0194520b3f17f002d767949f4d6"}
	{
	"name": "Scheduler.__init__",
	"ts": 1747853584301159.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.303000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "b3d5b1657c0e49cf1592a1537270f768"}
	{
	"name": "Scheduler.codegen",
	"ts": 1747853584303451.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.360000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "c37713dfbf3b0961875e261626d96eb8"}
	{
	"name": "inductor_codecache_torch_key",
	"ts": 1747853584360758.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.363000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "d463538e93ca4be75fe963864ef1ca71"}
	{
	"name": "inductor_codecache_torch_key",
	"ts": 1747853584363392.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.422000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "de6ee4051c0712480cf3f5f85e6fe937"}
	{
	"name": "Scheduler.codegen",
	"ts": 1747853584422070.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.426000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "4e93bcf6d30f8dc5a682773479cbfd9d"}
	{
	"name": "CppWrapperGpu.generate",
	"ts": 1747853584426113.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.428000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "5d77d652c1a7945cbe2b77d97c07e9cc"}
	{
	"name": "CppWrapperCpu.generate",
	"ts": 1747853584428268.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.432000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "3fb60349a92bf27720483b86288127dd"}
	{
	"name": "PythonWrapperCodegen.generate",
	"ts": 1747853584432830.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.442000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "7ec9a78387b3ef552180ca15ce43493d"}
	{
	"name": "async_compile.precompile",
	"ts": 1747853584442858.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.465000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "6cff99cb65a7305a9da27276445f1aa6"}
	{
	"name": "async_compile.precompile",
	"ts": 1747853584465275.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:04.468000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "eedfe778126513774d37c378af4a2463"}
	{
	"name": "async_compile.precompile",
	"ts": 1747853584468831.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.172000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "f1f8fddfd722377491b3feec4d48b8d2"}
	{
	"name": "async_compile.precompile",
	"ts": 1747853586172499.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.176000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "15930f4d877c064892c05cbefbd66ea7"}
	{
	"name": "async_compile.precompile",
	"ts": 1747853586176623.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.201000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "c08871880d8078296768d682fad688c8"}
	{
	"name": "async_compile.precompile",
	"ts": 1747853586201183.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.204000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "edc8068b15430c43d3a513e375696657"}
	{
	"name": "async_compile.wait",
	"ts": 1747853586204261.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.206000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "df86ae93ef3a5155e4d6148e599b105f"}
	{
	"name": "async_compile.wait",
	"ts": 1747853586206457.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.211000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_inductor/graph.py", 20]}
V0521 11:53:06.213000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_inductor/codegen/cpp_wrapper_gpu.py", 21]}
V0521 11:53:06.214000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_inductor/codegen/cpp_wrapper_cpu.py", 22]}
V0521 11:53:06.215000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_inductor/codegen/wrapper.py", 23]}
V0521 11:53:06.216000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["<string>", 24]}
V0521 11:53:06.218000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_inductor/async_compile.py", 25]}
V0521 11:53:06.219000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/async_compile.py:110] {"artifact": {"name": "triton_kernel_info", "encoding": "json"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2343, "name": "compile_fx", "filename": 13, "loc": "return inference_compiler(unlifted_gm, example_inputs_)"}, {"line": 483, "name": "__call__", "filename": 14, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2184, "name": "fw_compiler_base", "filename": 13, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 17, "loc": "return func(*args, **kwds)"}, {"line": 710, "name": "compile_fx_inner", "filename": 13, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 124, "name": "debug_wrapper", "filename": 18, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 167, "name": "newFunction", "filename": 19, "loc": "return old_func(*args, **kwargs)"}, {"line": 853, "name": "_compile_fx_inner", "filename": 13, "loc": "mb_compiled_graph = fx_codegen_and_compile("}, {"line": 1503, "name": "fx_codegen_and_compile", "filename": 13, "loc": "return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)"}, {"line": 1347, "name": "codegen_and_compile", "filename": 13, "loc": "wrapper_code, kernel_code = graph.codegen_with_cpp_wrapper()"}, {"line": 2180, "name": "codegen_with_cpp_wrapper", "filename": 20, "loc": "return self.codegen()"}, {"line": 2231, "name": "codegen", "filename": 20, "loc": "result = self.wrapper_code.generate(self.is_inference)"}, {"line": 340, "name": "generate", "filename": 21, "loc": "return super().generate(is_inference)"}, {"line": 953, "name": "generate", "filename": 22, "loc": "return super().generate(is_inference)"}, {"line": 1405, "name": "generate", "filename": 23, "loc": "return self._generate(is_inference)"}, {"line": 1468, "name": "_generate", "filename": 23, "loc": "self.generate_and_run_autotune_block()"}, {"line": 1546, "name": "generate_and_run_autotune_block", "filename": 23, "loc": "exec(tuning_code, scope)"}, {"line": 118, "name": "<module>", "filename": 24, "loc": ""}, {"line": 487, "name": "wait", "filename": 25, "loc": "_compile_end()"}, {"line": 110, "name": "_compile_end", "filename": 25, "loc": "torch._logging.trace_structured("}], "has_payload": "e8eda62380eb2e9a430c87ae70ab2901"}
	{"triton_poi_fused_addmm_gelu_2": {"autotune_cache_state": "miss", "num_configs": 2, "compile_time_us": 22357}, "triton_poi_fused_addmm_relu_sigmoid_0": {"autotune_cache_state": "only 1 config", "only_config": [["XBLOCK", 128], ["num_warps", 4], ["num_stages", 1]], "compile_time_us": 19836}, "triton_poi_fused_mul_1": {"autotune_cache_state": "miss", "num_configs": 2, "compile_time_us": 1701537}}
V0521 11:53:06.226000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "7c28cbc5785118fc27f1ae02202252f8"}
	{
	"name": "CachingAutotuner.benchmark_all_configs",
	"ts": 1747853586225889.8,
	"args": {
	"kernel_name": "triton_poi_fused_mul_1",
	"is_backward": false,
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.229000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "5cb5098a7d41c7066f5a3716f27a85e3"}
	{
	"name": "TritonBenchmarker.benchmark_gpu",
	"ts": 1747853586229060.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.246000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "f5b2a728f02cc71987de5443bd974b34"}
	{
	"name": "TritonBenchmarker.benchmark_gpu",
	"ts": 1747853586246617.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.250000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "4bb5bc539d87fd11e1c343edaca8af2c"}
	{
	"name": "TritonBenchmarker.benchmark_gpu",
	"ts": 1747853586250225.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.299000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "3091435814439c836e799326a69a27f6"}
	{
	"name": "TritonBenchmarker.benchmark_gpu",
	"ts": 1747853586299093.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.308000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "d5b85c401445c6bacc8d4cafa97dc8cf"}
	{
	"name": "CachingAutotuner.benchmark_all_configs",
	"ts": 1747853586308462.8,
	"args": {
	"kernel_name": "triton_poi_fused_mul_1",
	"is_backward": false,
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.321000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "14b213c20f1331c77ca53fb97deb7988"}
	{
	"name": "CachingAutotuner.benchmark_all_configs",
	"ts": 1747853586321279.0,
	"args": {
	"kernel_name": "triton_poi_fused_addmm_gelu_2",
	"is_backward": false,
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.324000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "79ef0436736bac28ab47d6143fa8422e"}
	{
	"name": "TritonBenchmarker.benchmark_gpu",
	"ts": 1747853586323935.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.391000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "1690a966318c1791b5278b18c240f7fe"}
	{
	"name": "TritonBenchmarker.benchmark_gpu",
	"ts": 1747853586390898.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.393000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "0aeb083f181993f27ea696a9bc064b5d"}
	{
	"name": "TritonBenchmarker.benchmark_gpu",
	"ts": 1747853586393676.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.462000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "198b577413dbb0e46b551e559a5ed111"}
	{
	"name": "TritonBenchmarker.benchmark_gpu",
	"ts": 1747853586462017.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.468000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "5651f777d0f987758e5c22656770f875"}
	{
	"name": "CachingAutotuner.benchmark_all_configs",
	"ts": 1747853586468060.8,
	"args": {
	"kernel_name": "triton_poi_fused_addmm_gelu_2",
	"is_backward": false,
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.481000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "87bf1e0996992bf59f8939bcda18eff8"}
	{
	"name": "PythonWrapperCodegen.generate",
	"ts": 1747853586481755.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.484000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "f29aea839719eb9e12badd763e23122f"}
	{
	"name": "CppWrapperCpu.generate",
	"ts": 1747853586484256.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.487000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "c0cda343fba2ca8ca5e9c97d4a137ae0"}
	{
	"name": "CppWrapperGpu.generate",
	"ts": 1747853586487096.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.489000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "967578d68b2a18698c7c5ca4154ba1ed"}
	{
	"name": "GraphLowering.codegen",
	"ts": 1747853586489532.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:06.492000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "5983f9ae07c63ebd87fcd361741eeec7"}
	{
	"name": "AotCodeCompiler.compile",
	"ts": 1747853586492007.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:14.269000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_inductor/codecache.py", 26]}
V0521 11:53:14.270000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/codecache.py:1650] {"graph_dump": {"name": "inductor_aot_wrapper_code", "type": "cpp", "filename": "/var/tmp/torchinductor_shangdiy/cwhkamk7hukdm5d55b4fxkyyok5x57mzbc2hzfy243x4xp2dcbtz/c4luzwr223piov46zdzgg4ihsoutvws2ipk5enbhr4ary57ghknl.wrapper.cpp"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2343, "name": "compile_fx", "filename": 13, "loc": "return inference_compiler(unlifted_gm, example_inputs_)"}, {"line": 483, "name": "__call__", "filename": 14, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2184, "name": "fw_compiler_base", "filename": 13, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 17, "loc": "return func(*args, **kwds)"}, {"line": 710, "name": "compile_fx_inner", "filename": 13, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 124, "name": "debug_wrapper", "filename": 18, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 167, "name": "newFunction", "filename": 19, "loc": "return old_func(*args, **kwargs)"}, {"line": 853, "name": "_compile_fx_inner", "filename": 13, "loc": "mb_compiled_graph = fx_codegen_and_compile("}, {"line": 1503, "name": "fx_codegen_and_compile", "filename": 13, "loc": "return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)"}, {"line": 1372, "name": "codegen_and_compile", "filename": 13, "loc": "compiled_fn = AotCodeCompiler.compile("}, {"line": 1650, "name": "compile", "filename": 26, "loc": "trace_structured("}], "has_payload": "ac98c68a16cced9b0c5f46826042c1b9"}
	
	#include <torch/csrc/inductor/aoti_include/cuda.h>
	// Definition of AOTI runtime interface functions
	
	#include <torch/csrc/inductor/aoti_runtime/interface.h>
	#include <torch/csrc/inductor/aoti_runtime/model_container.h>
	
	#include <iostream>
	#include <vector>
	
	#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)      \
	  try {                                           \
	    __VA_ARGS__                                   \
	  } catch (const std::exception& e) {             \
	    std::cerr << "Error: " << e.what() << '\n';   \
	    return AOTI_RUNTIME_FAILURE;                  \
	  } catch (...) {                                 \
	    std::cerr << "Unknown exception occurred.\n"; \
	    return AOTI_RUNTIME_FAILURE;                  \
	  }                                               \
	  return AOTI_RUNTIME_SUCCESS;
	
	#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name)  \
	  do {                                                            \
	    AOTI_RUNTIME_CHECK(                                           \
	        actual_size == expected_size,                             \
	        "expected " + std::string(name) + " vector size to be " + \
	            std::to_string(expected_size) + ", but got " +        \
	            std::to_string(actual_size));                         \
	  } while (0)
	
	// AOTInductor uses at::addmm_out, which doesn't supports
	// arguments that requires gradient. For this reason, we
	// enforce no_grad context for run APIs.
	//
	// A RAII, thread local (!) guard that enables or disables grad mode upon
	// construction, and sets it back to the original value upon destruction.
	struct AOTINoGradGuard {
	  AOTINoGradGuard() {
	    aoti_torch_grad_mode_set_enabled(false);
	  }
	  AOTINoGradGuard(const AOTINoGradGuard&) = delete;
	  AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete;
	  ~AOTINoGradGuard() {
	    aoti_torch_grad_mode_set_enabled(prev_mode);
	  }
	  AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete;
	  AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete;
	  bool prev_mode{aoti_torch_grad_mode_is_enabled()};
	};
	
	extern "C" {
	
	AOTIRuntimeError AOTInductorModelContainerCreate(
	    AOTInductorModelContainerHandle* container_handle,
	    size_t num_models,
	    bool is_cpu,
	    const char* cubin_dir) {
	      return AOTInductorModelContainerCreateWithDevice(
	        container_handle,
	        num_models,
	        is_cpu ? "cpu" : "cuda",
	        cubin_dir);
	}
	
	AOTIRuntimeError AOTInductorModelContainerCreateWithDevice(
	    AOTInductorModelContainerHandle* container_handle,
	    size_t num_models,
	    const char* device_str,
	    const char* cubin_dir) {
	  if (num_models == 0) {
	    std::cerr << "Error: num_models must be positive, but got 0\n";
	    return AOTI_RUNTIME_FAILURE;
	  }
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    std::optional<std::string> cubin_dir_opt;
	    if (cubin_dir != nullptr) {
	      cubin_dir_opt.emplace(cubin_dir);
	    }
	    auto* container = new torch::aot_inductor::AOTInductorModelContainer(
	        num_models, std::string(device_str), cubin_dir_opt);
	    *container_handle =
	        reinterpret_cast<AOTInductorModelContainerHandle>(container);
	  })
	}
	
	AOTIRuntimeError AOTInductorModelContainerDelete(
	    AOTInductorModelContainerHandle container_handle) {
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    auto* container =
	        reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	            container_handle);
	    delete container;
	  });
	}
	
	AOTIRuntimeError AOTInductorModelContainerRun(
	    AOTInductorModelContainerHandle container_handle,
	    AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles
	                                     // are stolen; the array itself is borrowed
	    size_t num_inputs,
	    AtenTensorHandle*
	        output_handles, // array for writing output AtenTensorHandle; handles
	                        // will be stolen by the caller; the array itself is
	                        // borrowed
	    size_t num_outputs,
	    AOTInductorStreamHandle stream_handle,
	    AOTIProxyExecutorHandle proxy_executor_handle) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs");
	  AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs");
	
	  auto stream =
	      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    AOTINoGradGuard guard;
	    container->run(
	        input_handles, output_handles, stream, proxy_executor_handle);
	  })
	}
	
	AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded(
	    AOTInductorModelContainerHandle container_handle,
	    AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles
	                                     // are stolen; the array itself is borrowed
	    size_t num_inputs,
	    AtenTensorHandle*
	        output_handles, // array for writing output AtenTensorHandle; handles
	                        // will be stolen by the caller; the array itself is
	                        // borrowed
	    size_t num_outputs,
	    AOTInductorStreamHandle stream_handle,
	    AOTIProxyExecutorHandle proxy_executor_handle) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs");
	  AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs");
	
	  auto stream =
	      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    AOTINoGradGuard guard;
	    container->run_single_threaded(
	        input_handles, output_handles, stream, proxy_executor_handle);
	  })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetNumConstants(
	    AOTInductorModelContainerHandle container_handle,
	    size_t* num_constants) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE(
	    { *num_constants = container->num_constants(); })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetConstantName(
	    AOTInductorModelContainerHandle container_handle,
	    size_t idx,
	    const char** name) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE(
	    { *name = container->constant_name(idx); })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN(
	    AOTInductorModelContainerHandle container_handle,
	    size_t idx,
	    const char** original_fqn) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE(
	    { *original_fqn = container->constant_original_fqn(idx); })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded(
	    AOTInductorModelContainerHandle container_handle,
	    size_t idx,
	    bool* from_folded) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetConstantType(
	    AOTInductorModelContainerHandle container_handle,
	    size_t idx,
	    int32_t* type) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetConstantDtype(
	    AOTInductorModelContainerHandle container_handle,
	    size_t idx,
	    int32_t* dtype) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE(
	    { *dtype = container->constant_dtype(idx); })
	}
	
	AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap(
	    AOTInductorModelContainerHandle container_handle,
	    AOTInductorConstantMapHandle constant_map_handle,
	    bool use_inactive) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  auto constants_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE(
	    { const auto ret = container->extract_constants_map(use_inactive);
	      for (const auto& pair: ret) {
	        constants_map->emplace(pair.first, pair.second);
	      }
	    })
	}
	
	AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer(
	    AOTInductorModelContainerHandle container_handle,
	    AOTInductorConstantMapHandle constant_map_handle,
	    bool use_inactive,
	    bool validate_full_update) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    container->update_constant_buffer(
	        *input_map, use_inactive, validate_full_update, /* user_managed = */ true);
	  })
	}
	
	AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer(
	    AOTInductorModelContainerHandle container_handle,
	    AOTInductorConstantMapHandle constant_map_handle,
	    bool use_inactive,
	    bool validate_full_update) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    container->update_constant_buffer(
	        *input_map, use_inactive, validate_full_update);
	  })
	}
	
	AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer(
	    AOTInductorModelContainerHandle container_handle,
	    AOTInductorConstantMapHandle constant_map_handle) {
	  return AOTInductorModelContainerUpdateConstantBuffer(container_handle,
	          constant_map_handle,
	          /*use_inactive*/ true,
	          /*validate_full_update*/ true);
	}
	
	AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer(
	    AOTInductorModelContainerHandle container_handle) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    container->free_inactive_constant_buffer();
	  })
	}
	
	AOTIRuntimeError AOTInductorModelContainerRunConstantFolding(
	    AOTInductorModelContainerHandle container_handle,
	    bool use_inactive,
	    AOTInductorStreamHandle stream_handle,
	    AOTIProxyExecutorHandle proxy_executor_handle) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  auto stream =
	      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    AOTINoGradGuard guard;
	    container->run_const_fold(use_inactive, stream, proxy_executor_handle);
	  })
	}
	
	AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer(
	    AOTInductorModelContainerHandle container_handle) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    container->swap_constant_buffer();
	  })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetNumInputs(
	    AOTInductorModelContainerHandle container_handle,
	    size_t* ret_num_inputs) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE(
	      { *ret_num_inputs = container->num_inputs(); })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetInputName(
	    AOTInductorModelContainerHandle container_handle,
	    size_t input_idx,
	    const char** ret_input_names) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE(
	      { *ret_input_names = container->input_name(input_idx); })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetNumOutputs(
	    AOTInductorModelContainerHandle container_handle,
	    size_t* ret_num_outputs) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE(
	      { *ret_num_outputs = container->num_outputs(); })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetOutputName(
	    AOTInductorModelContainerHandle container_handle,
	    size_t output_idx,
	    const char** ret_output_names) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE(
	      { *ret_output_names = container->output_name(output_idx); })
	}
	
	AOTIRuntimeError AOTInductorModelContainerGetCallSpec(
	    AOTInductorModelContainerHandle container_handle,
	    const char** in_spec,
	    const char** out_spec) {
	  auto* container =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
	          container_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    *in_spec = container->get_in_spec();
	    *out_spec = container->get_out_spec();
	  })
	}
	
	AOTIRuntimeError AOTInductorModelCreate(
	    AOTInductorModelHandle* model_handle,
	    AOTInductorConstantMapHandle constant_map_handle){
	    CONVERT_EXCEPTION_TO_ERROR_CODE({
	      auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
	      auto constant_array = std::make_shared<std::vector<torch::aot_inductor::ConstantHandle>>();
	      auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
	
	      auto model = new torch::aot_inductor::AOTInductorModel(
	          constant_map,
	          constant_array,
	          "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models
	          ""
	      );
	
	      if (input_map) {
	        for (auto const& kv : *input_map) {
	          constant_map->emplace(kv.first, kv.second);
	        }
	      } else {
	        model->load_constants();
	      }
	
	      *model_handle = reinterpret_cast<AOTInductorModelHandle>(model);
	    })}
	
	AOTIRuntimeError AOTInductorModelRun(
	    AOTInductorModelHandle model_handle,
	    AtenTensorHandle* input_handles,
	    AtenTensorHandle* output_handles) {
	  auto model =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    AOTINoGradGuard guard;
	    model->run_impl(
	        input_handles,
	        output_handles,
	        (torch::aot_inductor::DeviceStreamType) nullptr,
	        nullptr);
	  })
	}
	
	AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){
	    CONVERT_EXCEPTION_TO_ERROR_CODE({
	      auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(
	          model_handle);
	      delete model;
	    })}
	
	AOTIRuntimeError AOTInductorModelGetNumOutputs(
	    AOTInductorModelHandle model_handle,
	    size_t* ret_num_outputs) {
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	      auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
	      *ret_num_outputs = model->num_outputs();
	  })
	}
	
	AOTIRuntimeError AOTInductorModelUpdateConstantsMap(
	    AOTInductorModelHandle model_handle,
	    AOTInductorConstantMapHandle constant_map_handle) {
	  auto model =
	      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
	  CONVERT_EXCEPTION_TO_ERROR_CODE({
	    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
	    auto input_map =
	        reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(
	            constant_map_handle);
	
	    for (auto const& kv : *input_map) {
	      constant_map->emplace(kv.first, kv.second);
	    }
	    model->update_constants_map(std::move(constant_map));
	  })
	}
	
	} // extern "C"
	
	
	#define CUDA_DRIVER_CHECK(EXPR)                    \
	do {                                               \
	    CUresult code = EXPR;                          \
	    const char *msg;                               \
	    CUresult code_get_error = cuGetErrorString(code, &msg); \
	    if (code_get_error != CUDA_SUCCESS) {          \
	        throw std::runtime_error(                  \
	            std::string("CUDA driver error: ") +   \
	            std::string("invalid error code!"));   \
	    }                                              \
	    if (code != CUDA_SUCCESS) {                    \
	        throw std::runtime_error(                  \
	            std::string("CUDA driver error: ") +   \
	            std::string(msg));                     \
	    }                                              \
	} while (0);
	
	static inline CUfunction loadKernel(
	        std::string filePath,
	        const std::string &funcName,
	        uint32_t sharedMemBytes,
	        const std::optional<std::string> &cubinDir = std::nullopt) {
	    if (cubinDir) {
	        std::filesystem::path p1{*cubinDir};
	        std::filesystem::path p2{filePath};
	        filePath = (p1 / p2.filename()).string();
	    }
	
	    CUmodule mod;
	    CUfunction func;
	    CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str()));
	    CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
	    if (sharedMemBytes > 0) {
	        CUDA_DRIVER_CHECK(cuFuncSetAttribute(
	            func,
	            CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
	            sharedMemBytes
	        ))
	    }
	    return func;
	}
	
	static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) {
	    CUmodule mod;
	    CUfunction func;
	    CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start));
	    CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
	    if (sharedMemBytes > 0) {
	        CUDA_DRIVER_CHECK(cuFuncSetAttribute(
	            func,
	            CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
	            sharedMemBytes
	        ))
	    }
	    return func;
	}
	
	static inline void launchKernel(
	        CUfunction func,
	        uint32_t gridX,
	        uint32_t gridY,
	        uint32_t gridZ,
	        uint32_t numWarps,
	        uint32_t sharedMemBytes,
	        void* args[],
	        cudaStream_t stream) {
	    CUDA_DRIVER_CHECK(cuLaunchKernel(
	        func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr
	    ));
	}
	CACHE_TORCH_DTYPE(float32);
	CACHE_TORCH_DEVICE(cuda);
	CACHE_TORCH_LAYOUT(strided);
	namespace torch::aot_inductor {
	namespace {
	class AOTInductorModelKernels : public AOTInductorModelKernelsBase {
	  public:
	    CUfunction triton_poi_fused_addmm_gelu_2{nullptr};
	    CUfunction triton_poi_fused_addmm_relu_sigmoid_0{nullptr};
	    CUfunction triton_poi_fused_mul_1{nullptr};
	};
	}  // namespace
	
	
	
	AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map,
	                                   std::shared_ptr<std::vector<ConstantHandle>> constants_array,
	                                   const std::string& device_str,
	                                   std::optional<std::string> cubin_dir,
	                                   bool include_weights)
	    : AOTInductorModelBase(4,
	                           2,
	                           2,
	                           device_str,
	                           std::move(cubin_dir),
	                           true) {
	    inputs_info_[0].name = "arg2_1";
	    inputs_info_[1].name = "arg3_1";
	    inputs_info_[2].name = "arg4_1";
	    inputs_info_[3].name = "arg5_1";
	    constants_info_[0].name = "fc1_weight";
	    constants_info_[0].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
	    constants_info_[0].offset = 0;
	    constants_info_[0].data_size = 640;
	    constants_info_[0].from_folded = false;
	    constants_info_[0].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
	    constants_info_[0].shape = {16, 10};
	    constants_info_[0].stride = {10, 1};
	    constants_info_[0].layout = static_cast<int32_t>(cached_torch_layout_strided);
	    constants_info_[0].original_fqn = "fc1.weight";
	    constants_info_[1].name = "fc1_bias";
	    constants_info_[1].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
	    constants_info_[1].offset = 0;
	    constants_info_[1].data_size = 64;
	    constants_info_[1].from_folded = false;
	    constants_info_[1].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
	    constants_info_[1].shape = {16};
	    constants_info_[1].stride = {1};
	    constants_info_[1].layout = static_cast<int32_t>(cached_torch_layout_strided);
	    constants_info_[1].original_fqn = "fc1.bias";
	    update_constants_map(std::move(constants_map));
	    update_constants_array(std::move(constants_array));
	    in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}, {"type": null, "context": null, "children_spec": []}, {"type": null, "context": null, "children_spec": []}, {"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])";
	    out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}, {"type": null, "context": null, "children_spec": []}]}])";
	    outputs_info_[0].name = "output0";
	    outputs_info_[1].name = "output1";
	    this->kernels_ = std::make_unique<AOTInductorModelKernels>();
	}
	
	std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
	    DeviceStreamType stream,
	    AOTIProxyExecutorHandle proxy_executor,
	    bool initialization
	) {
	
	    if (!initialization) {
	        std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: "
	                  << "aot_inductor.use_runtime_constant_folding=False\n";
	    }
	    return {};
	}
	} // namespace torch::aot_inductor
	using namespace torch::aot_inductor;
	
	template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename kernels_type_>
	static inline void call_triton_poi_fused_addmm_relu_sigmoid_0(
	    const in_out_ptr0_type_& in_out_ptr0,
	    const in_ptr0_type_& in_ptr0,
	    int64_t xnumel,
	    cudaStream_t stream_,
	    kernels_type_& kernels_,
	    const std::optional<std::string>& cubin_dir_ = std::nullopt
	){
	    /*
	    async_compile.triton('triton_poi_fused_addmm_relu_sigmoid_0', '''
	    import triton
	    import triton.language as tl
	    from triton.compiler.compiler import AttrsDescriptor
	
	    from torch._inductor.runtime import triton_helpers, triton_heuristics
	    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
	    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
	    triton_helpers.set_driver_to_gpu()
	
	    @triton_heuristics.pointwise(
	        size_hints={'x': 128}, 
	        filename=__file__,
	        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=108, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
	        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_addmm_relu_sigmoid_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': 'E60230F396CE2570D7443DBDAB412080A10F057AFC6CD7E25C3A1E6D2B5F30C6', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'is_fbcode': True},
	        min_elem_per_thread=0
	    )
	    @triton.jit
	    def triton_poi_fused_addmm_relu_sigmoid_0(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
	        xnumel = 128
	        xoffset = tl.program_id(0) * XBLOCK
	        xindex = xoffset + tl.arange(0, XBLOCK)[:]
	        xmask = xindex < xnumel
	        x2 = xindex
	        x0 = (xindex % 16)
	        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
	        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
	        tmp2 = tmp0 + tmp1
	        tmp3 = tl.full([1], 0, tl.int32)
	        tmp4 = triton_helpers.maximum(tmp3, tmp2)
	        tmp5 = tl.sigmoid(tmp4)
	        tl.store(in_out_ptr0 + (x2), tmp5, xmask)
	    ''', device_str='cuda')
	    */
	    uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
	    uint32_t grid_1 = 1;
	    uint32_t grid_2 = 1;
	    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
	    if (kernels_.triton_poi_fused_addmm_relu_sigmoid_0 == nullptr) {
	        kernels_.triton_poi_fused_addmm_relu_sigmoid_0 = loadKernel("/var/tmp/torchinductor_shangdiy/cwhkamk7hukdm5d55b4fxkyyok5x57mzbc2hzfy243x4xp2dcbtz/crliqflurl6xjuakbcqp2ef6xkzs4p6x3hf3i5h3fiac575ra3gr.cubin", "triton_poi_fused_addmm_relu_sigmoid_0", 0, cubin_dir_); 
	    }
	    CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
	    CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
	    int var_2 = xnumel;
	    void* kernel_args_[] = {&var_0, &var_1, &var_2};
	    launchKernel(kernels_.triton_poi_fused_addmm_relu_sigmoid_0, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
	}
	
	template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
	static inline void call_triton_poi_fused_mul_1(
	    const in_ptr0_type_& in_ptr0,
	    const out_ptr0_type_& out_ptr0,
	    int64_t xnumel,
	    cudaStream_t stream_,
	    kernels_type_& kernels_,
	    const std::optional<std::string>& cubin_dir_ = std::nullopt
	){
	    /*
	    async_compile.triton('triton_poi_fused_mul_1', '''
	    import triton
	    import triton.language as tl
	    from triton.compiler.compiler import AttrsDescriptor
	
	    from torch._inductor.runtime import triton_helpers, triton_heuristics
	    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
	    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
	    triton_helpers.set_driver_to_gpu()
	
	    @triton_heuristics.pointwise(
	        size_hints={'x': 256}, 
	        filename=__file__,
	        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=108, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
	        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'E60230F396CE2570D7443DBDAB412080A10F057AFC6CD7E25C3A1E6D2B5F30C6', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'is_fbcode': True},
	        min_elem_per_thread=0
	    )
	    @triton.jit
	    def triton_poi_fused_mul_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
	        xnumel = 200
	        xoffset = tl.program_id(0) * XBLOCK
	        xindex = xoffset + tl.arange(0, XBLOCK)[:]
	        xmask = xindex < xnumel
	        x0 = xindex
	        tmp0 = tl.load(in_ptr0 + (x0), xmask)
	        tmp1 = 3.14
	        tmp2 = tmp0 * tmp1
	        tl.store(out_ptr0 + (x0), tmp2, xmask)
	    ''', device_str='cuda')
	    */
	    uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
	    uint32_t grid_1 = 1;
	    uint32_t grid_2 = 1;
	    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
	    if (kernels_.triton_poi_fused_mul_1 == nullptr) {
	        kernels_.triton_poi_fused_mul_1 = loadKernel("/var/tmp/torchinductor_shangdiy/cwhkamk7hukdm5d55b4fxkyyok5x57mzbc2hzfy243x4xp2dcbtz/cwrpxdtwaatre4mev6pwrnq4q2gasssvaruuiamljpbzklliaxl7.cubin", "triton_poi_fused_mul_1", 0, cubin_dir_); 
	    }
	    CUdeviceptr var_4 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
	    CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
	    int var_6 = xnumel;
	    void* kernel_args_[] = {&var_4, &var_5, &var_6};
	    launchKernel(kernels_.triton_poi_fused_mul_1, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
	}
	
	template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename kernels_type_>
	static inline void call_triton_poi_fused_addmm_gelu_2(
	    const in_out_ptr0_type_& in_out_ptr0,
	    const in_ptr0_type_& in_ptr0,
	    int64_t xnumel,
	    cudaStream_t stream_,
	    kernels_type_& kernels_,
	    const std::optional<std::string>& cubin_dir_ = std::nullopt
	){
	    /*
	    async_compile.triton('triton_poi_fused_addmm_gelu_2', '''
	    import triton
	    import triton.language as tl
	    from triton.compiler.compiler import AttrsDescriptor
	
	    from torch._inductor.runtime import triton_helpers, triton_heuristics
	    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
	    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
	    triton_helpers.set_driver_to_gpu()
	
	    @triton_heuristics.pointwise(
	        size_hints={'x': 512}, 
	        filename=__file__,
	        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=108, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
	        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_addmm_gelu_2', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': 'E60230F396CE2570D7443DBDAB412080A10F057AFC6CD7E25C3A1E6D2B5F30C6', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'is_fbcode': True},
	        min_elem_per_thread=0
	    )
	    @triton.jit
	    def triton_poi_fused_addmm_gelu_2(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
	        xnumel = 300
	        xoffset = tl.program_id(0) * XBLOCK
	        xindex = xoffset + tl.arange(0, XBLOCK)[:]
	        xmask = xindex < xnumel
	        x0 = xindex
	        tmp0 = tl.load(in_out_ptr0 + (x0), xmask)
	        tmp1 = tl.load(in_ptr0 + (x0), xmask)
	        tmp2 = tmp0 + tmp1
	        tmp3 = 0.5
	        tmp4 = tmp2 * tmp3
	        tmp5 = 0.7071067811865476
	        tmp6 = tmp2 * tmp5
	        tmp7 = libdevice.erf(tmp6)
	        tmp8 = 1.0
	        tmp9 = tmp7 + tmp8
	        tmp10 = tmp4 * tmp9
	        tl.store(in_out_ptr0 + (x0), tmp10, xmask)
	    ''', device_str='cuda')
	    */
	    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
	    uint32_t grid_1 = 1;
	    uint32_t grid_2 = 1;
	    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
	    if (kernels_.triton_poi_fused_addmm_gelu_2 == nullptr) {
	        kernels_.triton_poi_fused_addmm_gelu_2 = loadKernel("/var/tmp/torchinductor_shangdiy/cwhkamk7hukdm5d55b4fxkyyok5x57mzbc2hzfy243x4xp2dcbtz/cvv47bk7x7ywwor7iem26qvrtir22j6is762ktgzvsltf3cf3ebs.cubin", "triton_poi_fused_addmm_gelu_2", 0, cubin_dir_); 
	    }
	    CUdeviceptr var_8 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
	    CUdeviceptr var_9 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
	    int var_10 = xnumel;
	    void* kernel_args_[] = {&var_8, &var_9, &var_10};
	    launchKernel(kernels_.triton_poi_fused_addmm_gelu_2, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
	}
	
	namespace torch::aot_inductor {
	
	void AOTInductorModel::_const_run_impl(
	    std::vector<AtenTensorHandle>& output_handles,
	    DeviceStreamType stream,
	    AOTIProxyExecutorHandle proxy_executor
	) {}
	
	AOTI_NOINLINE static void check_input_0(
	    AtenTensorHandle* input_handles
	) {
	    ConstantHandle arg2_1 = ConstantHandle(input_handles[0]);
	    int32_t arg2_1_dtype;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype));
	
	    int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32();
	    if (arg2_1_expected_dtype != arg2_1_dtype) {
	        std::stringstream ss;
	        ss << "input_handles[0]: unmatched dtype, "
	           << "expected: " << arg2_1_expected_dtype << "(at::kFloat), "
	           << "but got: " << arg2_1_dtype << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    auto arg2_1_size = arg2_1.sizes();
	
	    if (8 != arg2_1_size[0]) {
	        std::stringstream ss;
	        ss << "input_handles[0]: unmatched dim value at 0, "
	           << "expected: 8, " << "but got: " << arg2_1_size[0]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	
	    if (10 != arg2_1_size[1]) {
	        std::stringstream ss;
	        ss << "input_handles[0]: unmatched dim value at 1, "
	           << "expected: 10, " << "but got: " << arg2_1_size[1]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    auto arg2_1_stride = arg2_1.strides();
	
	    if (10 != arg2_1_stride[0]) {
	        std::stringstream ss;
	        ss << "input_handles[0]: unmatched stride value at 0, "
	           << "expected: 10, " << "but got: " << arg2_1_stride[0]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	
	    if (1 != arg2_1_stride[1]) {
	        std::stringstream ss;
	        ss << "input_handles[0]: unmatched stride value at 1, "
	           << "expected: 1, " << "but got: " << arg2_1_stride[1]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    int32_t arg2_1_device_type;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type));
	
	    int32_t arg2_1_expected_device_type = 1;
	    if (arg2_1_expected_device_type != arg2_1_device_type) {
	        std::stringstream ss;
	        ss << "input_handles[0]: unmatched device type, "
	        << "expected: " << arg2_1_expected_device_type << "1(cuda), "
	        << "but got: " << arg2_1_device_type << "\n";
	        throw std::runtime_error(ss.str());
	    }
	}
	
	AOTI_NOINLINE static void check_input_1(
	    AtenTensorHandle* input_handles
	) {
	    ConstantHandle arg3_1 = ConstantHandle(input_handles[1]);
	    int32_t arg3_1_dtype;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg3_1, &arg3_1_dtype));
	
	    int32_t arg3_1_expected_dtype = aoti_torch_dtype_float32();
	    if (arg3_1_expected_dtype != arg3_1_dtype) {
	        std::stringstream ss;
	        ss << "input_handles[1]: unmatched dtype, "
	           << "expected: " << arg3_1_expected_dtype << "(at::kFloat), "
	           << "but got: " << arg3_1_dtype << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    auto arg3_1_size = arg3_1.sizes();
	
	    if (10 != arg3_1_size[0]) {
	        std::stringstream ss;
	        ss << "input_handles[1]: unmatched dim value at 0, "
	           << "expected: 10, " << "but got: " << arg3_1_size[0]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	
	    if (20 != arg3_1_size[1]) {
	        std::stringstream ss;
	        ss << "input_handles[1]: unmatched dim value at 1, "
	           << "expected: 20, " << "but got: " << arg3_1_size[1]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    auto arg3_1_stride = arg3_1.strides();
	
	    if (20 != arg3_1_stride[0]) {
	        std::stringstream ss;
	        ss << "input_handles[1]: unmatched stride value at 0, "
	           << "expected: 20, " << "but got: " << arg3_1_stride[0]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	
	    if (1 != arg3_1_stride[1]) {
	        std::stringstream ss;
	        ss << "input_handles[1]: unmatched stride value at 1, "
	           << "expected: 1, " << "but got: " << arg3_1_stride[1]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    int32_t arg3_1_device_type;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg3_1, &arg3_1_device_type));
	
	    int32_t arg3_1_expected_device_type = 1;
	    if (arg3_1_expected_device_type != arg3_1_device_type) {
	        std::stringstream ss;
	        ss << "input_handles[1]: unmatched device type, "
	        << "expected: " << arg3_1_expected_device_type << "1(cuda), "
	        << "but got: " << arg3_1_device_type << "\n";
	        throw std::runtime_error(ss.str());
	    }
	}
	
	AOTI_NOINLINE static void check_input_2(
	    AtenTensorHandle* input_handles
	) {
	    ConstantHandle arg4_1 = ConstantHandle(input_handles[2]);
	    int32_t arg4_1_dtype;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg4_1, &arg4_1_dtype));
	
	    int32_t arg4_1_expected_dtype = aoti_torch_dtype_float32();
	    if (arg4_1_expected_dtype != arg4_1_dtype) {
	        std::stringstream ss;
	        ss << "input_handles[2]: unmatched dtype, "
	           << "expected: " << arg4_1_expected_dtype << "(at::kFloat), "
	           << "but got: " << arg4_1_dtype << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    auto arg4_1_size = arg4_1.sizes();
	
	    if (20 != arg4_1_size[0]) {
	        std::stringstream ss;
	        ss << "input_handles[2]: unmatched dim value at 0, "
	           << "expected: 20, " << "but got: " << arg4_1_size[0]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	
	    if (30 != arg4_1_size[1]) {
	        std::stringstream ss;
	        ss << "input_handles[2]: unmatched dim value at 1, "
	           << "expected: 30, " << "but got: " << arg4_1_size[1]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    auto arg4_1_stride = arg4_1.strides();
	
	    if (30 != arg4_1_stride[0]) {
	        std::stringstream ss;
	        ss << "input_handles[2]: unmatched stride value at 0, "
	           << "expected: 30, " << "but got: " << arg4_1_stride[0]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	
	    if (1 != arg4_1_stride[1]) {
	        std::stringstream ss;
	        ss << "input_handles[2]: unmatched stride value at 1, "
	           << "expected: 1, " << "but got: " << arg4_1_stride[1]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    int32_t arg4_1_device_type;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg4_1, &arg4_1_device_type));
	
	    int32_t arg4_1_expected_device_type = 1;
	    if (arg4_1_expected_device_type != arg4_1_device_type) {
	        std::stringstream ss;
	        ss << "input_handles[2]: unmatched device type, "
	        << "expected: " << arg4_1_expected_device_type << "1(cuda), "
	        << "but got: " << arg4_1_device_type << "\n";
	        throw std::runtime_error(ss.str());
	    }
	}
	
	AOTI_NOINLINE static void check_input_3(
	    AtenTensorHandle* input_handles
	) {
	    ConstantHandle arg5_1 = ConstantHandle(input_handles[3]);
	    int32_t arg5_1_dtype;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg5_1, &arg5_1_dtype));
	
	    int32_t arg5_1_expected_dtype = aoti_torch_dtype_float32();
	    if (arg5_1_expected_dtype != arg5_1_dtype) {
	        std::stringstream ss;
	        ss << "input_handles[3]: unmatched dtype, "
	           << "expected: " << arg5_1_expected_dtype << "(at::kFloat), "
	           << "but got: " << arg5_1_dtype << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    auto arg5_1_size = arg5_1.sizes();
	
	    if (10 != arg5_1_size[0]) {
	        std::stringstream ss;
	        ss << "input_handles[3]: unmatched dim value at 0, "
	           << "expected: 10, " << "but got: " << arg5_1_size[0]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	
	    if (30 != arg5_1_size[1]) {
	        std::stringstream ss;
	        ss << "input_handles[3]: unmatched dim value at 1, "
	           << "expected: 30, " << "but got: " << arg5_1_size[1]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    auto arg5_1_stride = arg5_1.strides();
	
	    if (30 != arg5_1_stride[0]) {
	        std::stringstream ss;
	        ss << "input_handles[3]: unmatched stride value at 0, "
	           << "expected: 30, " << "but got: " << arg5_1_stride[0]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	
	    if (1 != arg5_1_stride[1]) {
	        std::stringstream ss;
	        ss << "input_handles[3]: unmatched stride value at 1, "
	           << "expected: 1, " << "but got: " << arg5_1_stride[1]
	           << "\n";
	        throw std::runtime_error(ss.str());
	    }
	    int32_t arg5_1_device_type;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg5_1, &arg5_1_device_type));
	
	    int32_t arg5_1_expected_device_type = 1;
	    if (arg5_1_expected_device_type != arg5_1_device_type) {
	        std::stringstream ss;
	        ss << "input_handles[3]: unmatched device type, "
	        << "expected: " << arg5_1_expected_device_type << "1(cuda), "
	        << "but got: " << arg5_1_device_type << "\n";
	        throw std::runtime_error(ss.str());
	    }
	}
	
	static bool _check_aoti_runtime_check_inputs_env() {
	    const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS");
	    const static bool result = env_var_value != nullptr && env_var_value[0] != '0';
	    return result;
	}
	
	AOTI_NOINLINE static void __check_inputs_outputs(
	    AtenTensorHandle* input_handles,
	    AtenTensorHandle* output_handles) {
	    if (!_check_aoti_runtime_check_inputs_env()){
	        return;
	    }
	    check_input_0(input_handles);
	    check_input_1(input_handles);
	    check_input_2(input_handles);
	    check_input_3(input_handles);
	}
	
	void AOTInductorModel::run_impl(
	    AtenTensorHandle*
	        input_handles, // array of input AtenTensorHandle; handles
	                        // are stolen; the array itself is borrowed
	    AtenTensorHandle*
	        output_handles, // array for writing output AtenTensorHandle; handles
	                        // will be stolen by the caller; the array itself is
	                        // borrowed
	    DeviceStreamType stream,
	    AOTIProxyExecutorHandle proxy_executor
	) {
	    __check_inputs_outputs(input_handles, output_handles);
	
	    auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 4);
	    auto arg2_1 = std::move(inputs[0]);
	    auto arg3_1 = std::move(inputs[1]);
	    auto arg4_1 = std::move(inputs[2]);
	    auto arg5_1 = std::move(inputs[3]);
	    [[maybe_unused]] auto fc1_weight = constants_->at(0);
	    [[maybe_unused]] auto fc1_bias = constants_->at(1);
	    inputs.clear();
	    [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
	
	    AOTICudaStreamGuard stream_guard(stream, this->device_idx_);
	    static constexpr int64_t int_array_0[] = {8L, 16L};
	    static constexpr int64_t int_array_1[] = {16L, 1L};
	    AtenTensorHandle buf0_handle;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle));
	    RAIIAtenTensorHandle buf0(buf0_handle);
	    // Topologically Sorted Source Nodes: [], Original ATen: [aten.addmm]
	    static constexpr int64_t int_array_2[] = {10L, 16L};
	    static constexpr int64_t int_array_3[] = {1L, 10L};
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_mm_out(buf0, arg2_1, wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(fc1_weight, 2, int_array_2, int_array_3, 0L))));
	    arg2_1.reset();
	    auto buf1 = std::move(buf0);  // reuse
	    // Topologically Sorted Source Nodes: [add, relu, sigmoid], Original ATen: [aten.addmm, aten.relu, aten.sigmoid]
	    call_triton_poi_fused_addmm_relu_sigmoid_0(buf1, fc1_bias, 128L, stream, kernels, this->cubin_dir_);
	    static constexpr int64_t int_array_4[] = {10L, 20L};
	    static constexpr int64_t int_array_5[] = {20L, 1L};
	    AtenTensorHandle buf2_handle;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf2_handle));
	    RAIIAtenTensorHandle buf2(buf2_handle);
	    // Topologically Sorted Source Nodes: [mul], Original ATen: [aten.mul]
	    call_triton_poi_fused_mul_1(arg3_1, buf2, 200L, stream, kernels, this->cubin_dir_);
	    arg3_1.reset();
	    static constexpr int64_t int_array_6[] = {10L, 30L};
	    static constexpr int64_t int_array_7[] = {30L, 1L};
	    AtenTensorHandle buf3_handle;
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_6, int_array_7, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle));
	    RAIIAtenTensorHandle buf3(buf3_handle);
	    // Topologically Sorted Source Nodes: [mul, ], Original ATen: [aten.mul, aten.addmm]
	    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_mm_out(buf3, buf2, arg4_1));
	    arg4_1.reset();
	    buf2.reset();
	    auto buf4 = std::move(buf3);  // reuse
	    // Topologically Sorted Source Nodes: [add, gelu], Original ATen: [aten.addmm, aten.gelu]
	    call_triton_poi_fused_addmm_gelu_2(buf4, arg5_1, 300L, stream, kernels, this->cubin_dir_);
	    arg5_1.reset();
	    output_handles[0] = buf1.release();
	    output_handles[1] = buf4.release();
	} // AOTInductorModel::run_impl
	} // namespace torch::aot_inductor
	
	
	
	
V0521 11:53:14.273000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/codecache.py:1659] {"graph_dump": {"name": "inductor_aot_kernel_code", "type": "cpp", "filename": "/var/tmp/torchinductor_shangdiy/cwhkamk7hukdm5d55b4fxkyyok5x57mzbc2hzfy243x4xp2dcbtz/couhil56rhottovifsnuv42kzkikcazbllt7rbrifipt6o2iajei.kernel.cpp"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2343, "name": "compile_fx", "filename": 13, "loc": "return inference_compiler(unlifted_gm, example_inputs_)"}, {"line": 483, "name": "__call__", "filename": 14, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2184, "name": "fw_compiler_base", "filename": 13, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 17, "loc": "return func(*args, **kwds)"}, {"line": 710, "name": "compile_fx_inner", "filename": 13, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 124, "name": "debug_wrapper", "filename": 18, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 167, "name": "newFunction", "filename": 19, "loc": "return old_func(*args, **kwargs)"}, {"line": 853, "name": "_compile_fx_inner", "filename": 13, "loc": "mb_compiled_graph = fx_codegen_and_compile("}, {"line": 1503, "name": "fx_codegen_and_compile", "filename": 13, "loc": "return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)"}, {"line": 1372, "name": "codegen_and_compile", "filename": 13, "loc": "compiled_fn = AotCodeCompiler.compile("}, {"line": 1659, "name": "compile", "filename": 26, "loc": "trace_structured("}], "has_payload": "f759b06fed89430c9608ade4cca99362"}
	// Triton kernels are embedded as comments in /var/tmp/torchinductor_shangdiy/cwhkamk7hukdm5d55b4fxkyyok5x57mzbc2hzfy243x4xp2dcbtz/c4luzwr223piov46zdzgg4ihsoutvws2ipk5enbhr4ary57ghknl.wrapper.cpp
	
V0521 11:53:14.284000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "9643f6689a7019642805f9dd1f2748ac"}
	{
	"name": "compile_file",
	"ts": 1747853594284781.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.215000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "26535ffda296128b78ca7b81afd2ace7"}
	{
	"name": "compile_file",
	"ts": 1747853603215159.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.217000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "01172f9c9cc055ad4b4afb1138c2c131"}
	{
	"name": "compile_file",
	"ts": 1747853603217882.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.239000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "b3773181c4940cadd807808cf1dbaf74"}
	{
	"name": "compile_file",
	"ts": 1747853603239045.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.249000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "b3c3d08f603604eff4533e6d6116dbe2"}
	{
	"name": "compile_file",
	"ts": 1747853603249559.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.286000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "920e910fca59e8b46cea3dd7b2d1312f"}
	{
	"name": "compile_file",
	"ts": 1747853603286745.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.293000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "118ec3af6bc1b46ef2d37f66f6fb4d5f"}
	{
	"name": "compile_file",
	"ts": 1747853603293380.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "B",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.603000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "f92d0859c99021669e74d227ddb4a8cc"}
	{
	"name": "compile_file",
	"ts": 1747853603603086.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.606000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "5aa3a2bb2db55f98ace19244086c4c58"}
	{
	"name": "AotCodeCompiler.compile",
	"ts": 1747853603606394.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.609000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "a03e703afec92ff9660d23d0bb9954c9"}
	{
	"name": "GraphLowering.compile_to_fn",
	"ts": 1747853603609434.8,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.614000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_dynamo/utils.py", 27]}
V0521 11:53:23.615000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1903] {"chromium_event": {}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2343, "name": "compile_fx", "filename": 13, "loc": "return inference_compiler(unlifted_gm, example_inputs_)"}, {"line": 483, "name": "__call__", "filename": 14, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2184, "name": "fw_compiler_base", "filename": 13, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 17, "loc": "return func(*args, **kwds)"}, {"line": 710, "name": "compile_fx_inner", "filename": 13, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 124, "name": "debug_wrapper", "filename": 18, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 167, "name": "newFunction", "filename": 19, "loc": "return old_func(*args, **kwargs)"}, {"line": 923, "name": "_compile_fx_inner", "filename": 13, "loc": "CompileEventLogger.instant("}, {"line": 584, "name": "instant", "filename": 27, "loc": "CompileEventLogger.log_instant_event("}, {"line": 378, "name": "log_instant_event", "filename": 27, "loc": "chromium_log.log_instant_event("}, {"line": 1903, "name": "log_instant_event", "filename": 27, "loc": "torch._logging.trace_structured("}], "has_payload": "1379331874f5ac773d5643841544eae5"}
	{
	"name": "fx_graph_cache_disabled",
	"ts": 1747853583883408.2,
	"args": {
	"compile_id": "None"
	},
	"ph": "i",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0,
	"s": "p"
	}
V0521 11:53:23.616000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "9f899b72a19d50290c5b2b4fe5ca5369"}
	{
	"name": "fx_codegen_and_compile",
	"ts": 1747853603616796.0,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.624000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/compile_fx.py:967] {"artifact": {"name": "inductor_triton_kernel_to_post_grad_nodes", "encoding": "json"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2343, "name": "compile_fx", "filename": 13, "loc": "return inference_compiler(unlifted_gm, example_inputs_)"}, {"line": 483, "name": "__call__", "filename": 14, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2184, "name": "fw_compiler_base", "filename": 13, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 17, "loc": "return func(*args, **kwds)"}, {"line": 710, "name": "compile_fx_inner", "filename": 13, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 124, "name": "debug_wrapper", "filename": 18, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 167, "name": "newFunction", "filename": 19, "loc": "return old_func(*args, **kwargs)"}, {"line": 967, "name": "_compile_fx_inner", "filename": 13, "loc": "trace_structured("}], "has_payload": "cb474acf26f54f984894156d71cc0554"}
	{"triton_poi_fused_addmm_relu_sigmoid_0": ["sigmoid", "relu", "add_tensor_1"], "triton_poi_fused_mul_1": ["mul"], "triton_poi_fused_addmm_gelu_2": ["mul_3", "mul_1", "add_tensor", "add", "erf", "mul_2"], "aoti_torch_cuda_mm_out": ["mm_default_1", "mm_default", "mul"]}
V0521 11:53:23.626000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_inductor/compile_fx.py:975] {"artifact": {"name": "inductor_provenance_tracking_node_mappings", "encoding": "json"}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1745, "name": "compile_fx_aot", "filename": 13, "loc": "compiled_artifacts = compile_fx("}, {"line": 1943, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2000, "name": "compile_fx", "filename": 13, "loc": "return compile_fx("}, {"line": 2343, "name": "compile_fx", "filename": 13, "loc": "return inference_compiler(unlifted_gm, example_inputs_)"}, {"line": 483, "name": "__call__", "filename": 14, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2184, "name": "fw_compiler_base", "filename": 13, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 17, "loc": "return func(*args, **kwds)"}, {"line": 710, "name": "compile_fx_inner", "filename": 13, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 124, "name": "debug_wrapper", "filename": 18, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 167, "name": "newFunction", "filename": 19, "loc": "return old_func(*args, **kwargs)"}, {"line": 975, "name": "_compile_fx_inner", "filename": 13, "loc": "trace_structured("}], "has_payload": "74230a48139d13e914b6b596b0fe17d0"}
	{"preToPost": {"linear": ["permute", "mm_default_1", "add_tensor_1"], "relu": ["relu"], "sigmoid": ["sigmoid"], "mul": ["mul"], "addmm": ["mm_default", "add_tensor"], "gelu": ["mul_1", "mul_2", "erf", "add", "mul_3"]}, "postToPre": {"permute": ["linear"], "mm_default_1": ["linear"], "add_tensor_1": ["linear"], "relu": ["relu"], "sigmoid": ["sigmoid"], "mul": ["mul"], "mm_default": ["addmm"], "add_tensor": ["addmm"], "mul_1": ["gelu"], "mul_2": ["gelu"], "erf": ["gelu"], "add": ["gelu"], "mul_3": ["gelu"]}, "cppCodeToPost": {"triton_poi_fused_addmm_relu_sigmoid_0": ["sigmoid", "relu", "add_tensor_1"], "triton_poi_fused_mul_1": ["mul"], "triton_poi_fused_addmm_gelu_2": ["mul_3", "mul_1", "add_tensor", "add", "erf", "mul_2"], "aoti_torch_cuda_mm_out": ["mm_default_1", "mm_default", "mul"]}, "postToCppCode": {"sigmoid": ["triton_poi_fused_addmm_relu_sigmoid_0"], "relu": ["triton_poi_fused_addmm_relu_sigmoid_0"], "add_tensor_1": ["triton_poi_fused_addmm_relu_sigmoid_0"], "mul": ["triton_poi_fused_mul_1", "aoti_torch_cuda_mm_out"], "mul_3": ["triton_poi_fused_addmm_gelu_2"], "mul_1": ["triton_poi_fused_addmm_gelu_2"], "add_tensor": ["triton_poi_fused_addmm_gelu_2"], "add": ["triton_poi_fused_addmm_gelu_2"], "erf": ["triton_poi_fused_addmm_gelu_2"], "mul_2": ["triton_poi_fused_addmm_gelu_2"], "mm_default_1": ["aoti_torch_cuda_mm_out"], "mm_default": ["aoti_torch_cuda_mm_out"]}}
V0521 11:53:23.640000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "ba6c09fc89491369ba2da542ddb7439d"}
	{
	"name": "inductor_compile",
	"ts": 1747853603640603.0,
	"args": {
	"fn_name": "compile_fx_inner",
	"compile_id": "None",
	"is_backward": false,
	"cache_state": "disabled",
	"cache_event_time": 1747853583883408217,
	"key": null,
	"components": null,
	"cache_bypass_reason": "cache not enabled",
	"remote_cache_enabled": true,
	"local_cache_enabled": true
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.644000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "42a9f6c09d02cb92457473e766021e78"}
	{
	"name": "compile_fx.<locals>.fw_compiler_base",
	"ts": 1747853603643951.5,
	"args": {
	"compile_id": "None"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
V0521 11:53:23.650000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_logging/structured.py:27] {"str": ["/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/torch/_dynamo/metrics_context.py", 28]}
V0521 11:53:23.651000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1565] {"compilation_metrics": {"compile_id": null, "frame_key": null, "co_name": null, "co_filename": null, "co_firstlineno": null, "cache_size": null, "accumulated_cache_size": null, "guard_count": null, "shape_env_guard_count": null, "graph_op_count": null, "graph_node_count": null, "graph_input_count": null, "start_time": 1747853582.686129, "entire_frame_compile_time_s": null, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": null, "compliant_custom_ops": null, "restart_reasons": null, "dynamo_time_before_restart_s": null, "has_guarded_code": null, "remote_cache_time_saved_s": null, "structured_logging_overhead_s": null, "config_suppress_errors": null, "config_inline_inbuilt_nn_modules": null, "specialize_float": null, "dynamo_config": "{\"_autograd_backward_strict_mode_conditional_banned_ops\": [\"stride\", \"storage_offset\", \"is_contiguous\"], \"_unsafe_skip_fsdp_module_guards\": false, \"accumulated_recompile_limit\": 256, \"allow_complex_guards_as_runtime_asserts\": false, \"allow_empty_graphs\": false, \"allow_ignore_mark_dynamic\": false, \"allow_rnn\": false, \"allow_unspec_int_on_nn_module\": false, \"allowed_functions_module_string_ignorelist\": [\"torch._decomp\", \"torch._prims\", \"torch._refs\", \"torch.distributions\", \"torch.testing\"], \"assume_static_by_default\": true, \"automatic_dynamic_local_pgo\": true, \"automatic_dynamic_remote_pgo\": null, \"automatic_dynamic_shapes\": true, \"automatic_dynamic_shapes_mark_as\": \"dynamic\", \"capture_autograd_function\": true, \"capture_dynamic_output_shape_ops\": false, \"capture_func_transforms\": true, \"capture_scalar_outputs\": false, \"capture_sparse_compute\": false, \"compiled_autograd\": false, \"compiled_autograd_kwargs_override\": {}, \"cprofile\": false, \"cudagraph_backend_keep_input_mutation\": false, \"cudagraph_backend_support_input_mutation\": false, \"dead_code_elimination\": true, \"disable\": false, \"do_not_emit_runtime_asserts\": false, \"dont_skip_tracing\": false, \"dynamic_shapes\": true, \"enable_compiler_collectives\": false, \"enable_cpp_framelocals_guard_eval\": true, \"enable_cpp_guard_manager\": true, \"enable_cpp_symbolic_shape_guards\": false, \"enable_faithful_generator_behavior\": true, \"enable_trace_contextlib\": true, \"enable_trace_unittest\": false, \"error_on_nested_fx_trace\": true, \"error_on_nested_jit_trace\": true, \"error_on_recompile\": false, \"fail_on_recompile_limit_hit\": false, \"fake_tensor_cache_crosscheck_enabled\": false, \"fake_tensor_cache_enabled\": true, \"fake_tensor_disable_inference_mode\": true, \"force_nn_module_property_static_shapes\": true, \"force_parameter_static_shapes\": true, \"force_unspec_int_unbacked_size_like_on_torchrec_kjt\": false, \"graph_deduplication_lint\": false, \"guard_nn_modules\": true, \"guard_nn_modules_using_dict_tags\": true, \"inline_inbuilt_nn_modules\": true, \"install_free_tensors\": false, \"issue_3_13_0_warning\": true, \"minimum_call_count\": 1, \"numpy_default_complex\": \"complex128\", \"numpy_default_float\": \"float64\", \"numpy_default_int\": \"int64\", \"only_allow_pt2_compliant_ops\": false, \"optimize_ddp\": true, \"optimize_ddp_lazy_compile\": false, \"prefer_deferred_runtime_asserts_over_guards\": false, \"prepare_freezing\": false, \"raise_on_ctx_manager_usage\": true, \"raise_on_unsafe_aot_autograd\": false, \"recompile_limit\": 8, \"record_compile_time_instruction_count\": false, \"replay_record_enabled\": false, \"report_guard_failures\": true, \"rewrite_assert_with_torch_assert\": true, \"run_gc_after_compile\": true, \"skip_code_recursive_on_recompile_limit_hit\": true, \"skip_fsdp_guards\": true, \"skip_fsdp_hooks\": true, \"skip_nnmodule_hook_guards\": true, \"skip_no_tensor_aliasing_guards_on_parameters\": true, \"skip_tensor_guards_with_matching_dict_tags\": true, \"skip_torchrec\": true, \"skipfiles_inline_module_allowlist\": {}, \"specialize_float\": false, \"specialize_int\": false, \"suppress_errors\": false, \"trace_numpy\": true, \"track_nodes_for_deduplication\": false, \"use_graph_deduplication\": false, \"use_lazy_graph_module\": true, \"use_numpy_random_stream\": false, \"verify_correctness\": false, \"wrap_top_frame\": false}", "is_forward": null, "num_triton_bundles": null, "remote_fx_graph_cache_get_time_ms": null, "remote_fx_graph_cache_put_time_ms": null, "start_time_us": 1747853582686129, "duration_us": null, "dynamo_cumulative_compile_time_us": null, "aot_autograd_cumulative_compile_time_us": null, "inductor_cumulative_compile_time_us": null, "inductor_code_gen_cumulative_compile_time_us": null, "triton_compile_time_us": null, "runtime_cudagraphify_time_us": null, "runtime_triton_autotune_time_us": null, "dynamo_compile_time_before_restart_us": null, "distributed_ephemeral_timeout_us": null, "structured_logging_overhead_us": null, "remote_fx_graph_cache_get_time_us": null, "remote_fx_graph_cache_put_time_us": null, "backward_cumulative_compile_time_us": null, "end_time_us": 1747853603646017, "pre_grad_pass_time_us": null, "post_grad_pass_time_us": null, "joint_graph_pass_time_us": null, "log_format_version": 3, "inductor_config": "{\"TYPE_CHECKING\": false, \"_cache_config_ignore_prefix\": [\"trace\", \"cuda.cutlass_dir\", \"worker_start_method\", \"compile_threads\", \"post_grad_custom_post_pass\", \"post_grad_custom_pre_pass\", \"always_complex_memory_overlap_TESTING_ONLY\"], \"_collective.auto_select\": false, \"_collective.one_shot_all_reduce_threshold_bytes\": 131072, \"_fuse_ddp_bucket_size\": 25, \"_fuse_ddp_communication\": false, \"_fuse_ddp_communication_passes\": [\"fuse_ddp_with_concat_op\", \"schedule_comm_wait\"], \"_micro_pipeline_tp\": false, \"_pre_fusion_custom_pass\": null, \"_profile_var\": \"\", \"_raise_error_for_testing\": false, \"_save_config_ignore\": [\"trace.upload_tar\", \"joint_custom_pre_pass\", \"joint_custom_post_pass\", \"pre_grad_custom_pass\", \"aot_inductor.repro_level\", \"aot_inductor.dump_aoti_minifier\", \"post_grad_custom_pre_pass\", \"post_grad_custom_post_pass\"], \"add_pre_grad_passes\": null, \"aggressive_fusion\": false, \"alignment_asserts\": false, \"allow_buffer_reuse\": true, \"always_complex_memory_overlap_TESTING_ONLY\": false, \"always_keep_tensor_constants\": false, \"annotate_training\": false, \"aot_inductor.allow_stack_allocation\": false, \"aot_inductor.compile_wrapper_opt_level\": \"O1\", \"aot_inductor.custom_op_libs\": [], \"aot_inductor.custom_ops_to_c_shims\": {}, \"aot_inductor.debug_compile\": false, \"aot_inductor.debug_intermediate_value_printer\": \"0\", \"aot_inductor.dump_aoti_minifier\": false, \"aot_inductor.embed_cubin\": false, \"aot_inductor.filtered_kernel_names\": null, \"aot_inductor.force_mmap_weights\": false, \"aot_inductor.metadata\": {\"AOTI_DEVICE_KEY\": \"cuda\"}, \"aot_inductor.output_path\": \"\", \"aot_inductor.package\": false, \"aot_inductor.package_constants_in_so\": true, \"aot_inductor.package_cpp_only\": false, \"aot_inductor.precompile_headers\": false, \"aot_inductor.presets\": {}, \"aot_inductor.raise_error_on_ignored_optimization\": true, \"aot_inductor.repro_level\": 2, \"aot_inductor.serialized_in_spec\": \"\", \"aot_inductor.serialized_out_spec\": \"\", \"aot_inductor.use_minimal_arrayref_interface\": false, \"aot_inductor.use_runtime_constant_folding\": false, \"assert_indirect_indexing\": true, \"assume_aligned_inputs\": false, \"assume_unaligned_fallback_output\": false, \"autoheuristic_collect\": \"\", \"autoheuristic_log_path\": \"DEFAULT\", \"autoheuristic_use\": \"mixed_mm\", \"autotune_fallback_to_aten\": false, \"autotune_in_subproc\": false, \"autotune_local_cache\": true, \"autotune_multi_device\": false, \"autotune_num_choices_displayed\": 10, \"autotune_remote_cache\": null, \"b2b_gemm_pass\": false, \"batch_fusion\": true, \"benchmark_combo_kernel\": false, \"benchmark_epilogue_fusion\": true, \"benchmark_fusion\": false, \"benchmark_harness\": true, \"benchmark_kernel\": false, \"bfloat16_atomic_adds_enabled\": true, \"bundle_triton_into_fx_graph_cache\": null, \"bundled_autotune_remote_cache\": null, \"bw_outputs_user_visible\": true, \"can_inplace_pad_graph_input\": false, \"check_stack_no_cycles_TESTING_ONLY\": false, \"combo_kernel_allow_mixed_sizes\": 1, \"combo_kernel_foreach_dynamic_shapes\": false, \"combo_kernels\": false, \"combo_kernels_autotune\": 1, \"comment_origin\": false, \"compile_threads\": 22, \"comprehensive_padding\": true, \"compute_all_bounds\": false, \"constant_and_index_propagation\": true, \"conv_1x1_as_mm\": false, \"coordinate_descent_check_all_directions\": false, \"coordinate_descent_search_radius\": 1, \"coordinate_descent_tuning\": false, \"cpp.cxx\": [null, \"g++\"], \"cpp.descriptive_names\": \"original_aten\", \"cpp.dynamic_threads\": false, \"cpp.enable_concat_linear\": false, \"cpp.enable_floating_point_contract_flag\": \"off\", \"cpp.enable_grouped_gemm_template\": false, \"cpp.enable_kernel_profile\": false, \"cpp.enable_loop_tail_vec\": true, \"cpp.enable_tiling_heuristics\": true, \"cpp.enable_unsafe_math_opt_flag\": false, \"cpp.fallback_scatter_reduce_sum\": true, \"cpp.gemm_cache_blocking\": null, \"cpp.gemm_max_k_slices\": 1, \"cpp.gemm_thread_factors\": null, \"cpp.inject_log1p_bug_TESTING_ONLY\": null, \"cpp.inject_relu_bug_TESTING_ONLY\": null, \"cpp.max_horizontal_fusion_size\": 16, \"cpp.min_chunk_size\": 4096, \"cpp.no_redundant_loops\": true, \"cpp.simdlen\": null, \"cpp.threads\": -1, \"cpp.use_decompose_tanh\": false, \"cpp.vec_isa_ok\": null, \"cpp.weight_prepack\": true, \"cpp_cache_precompile_headers\": false, \"cpp_wrapper\": false, \"cpu_backend\": \"cpp\", \"cuda.arch\": null, \"cuda.compile_opt_level\": \"-O1\", \"cuda.cuda_cxx\": null, \"cuda.cutlass_backend_min_gemm_size\": 1, \"cuda.cutlass_dir\": \"/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/b9ba6d289a870795/scripts/shangdiy/__aot__/aot#link-tree/third_party/cutlass\", \"cuda.cutlass_epilogue_fusion_enabled\": false, \"cuda.cutlass_hash_with_compile_cmd\": false, \"cuda.cutlass_instantiation_level\": \"0\", \"cuda.cutlass_max_profiling_configs\": null, \"cuda.cutlass_max_profiling_swizzle_options\": [1, 2, 4], \"cuda.cutlass_op_allowlist_regex\": null, \"cuda.cutlass_op_denylist_regex\": null, \"cuda.cutlass_presets\": null, \"cuda.cutlass_tma_only\": false, \"cuda.enable_cuda_lto\": false, \"cuda.enable_debug_info\": false, \"cuda.enable_ptxas_info\": false, \"cuda.generate_test_runner\": false, \"cuda.use_fast_math\": false, \"cuda.version\": null, \"cuda_backend\": \"triton\", \"dce\": false, \"debug\": false, \"debug_fusion\": false, \"debug_index_asserts\": false, \"debug_ir_traceback\": false, \"decompose_mem_bound_mm\": false, \"developer_warnings\": true, \"disable_cpp_codegen\": false, \"disable_padding_cpu\": true, \"disable_progress\": true, \"dynamic_scale_rblock\": true, \"efficient_conv_bn_eval_fx_passes\": false, \"emulate_precision_casts\": false, \"enable_auto_functionalized_v2\": true, \"enable_linear_binary_folding\": false, \"enabled_metric_tables\": \"\", \"epilogue_fusion\": true, \"epilogue_fusion_first\": false, \"estimate_op_runtime\": \"default\", \"external_matmul\": [], \"fallback_random\": false, \"force_disable_caches\": false, \"force_fuse_int_mm_with_mul\": false, \"force_layout_optimization\": false, \"force_pointwise_cat\": false, \"force_same_precision\": true, \"force_shape_pad\": false, \"freezing\": false, \"freezing_discard_parameters\": false, \"fx_graph_cache\": true, \"fx_graph_remote_cache\": null, \"fx_passes_numeric_check\": {\"num_iterations\": 1, \"pre_grad\": false, \"precision\": 0.0001, \"requires_optimizer\": true}, \"generate_intermediate_hooks\": false, \"global_cache_dir\": null, \"graph_partition\": false, \"group_fusion\": false, \"halide.asserts\": false, \"halide.cpu_target\": \"host\", \"halide.debug\": false, \"halide.gpu_target\": \"host-cuda\", \"halide.scan_kernels\": false, \"halide.scheduler_cpu\": \"Adams2019\", \"halide.scheduler_cuda\": \"Anderson2021\", \"implicit_fallbacks\": true, \"inplace_buffers\": true, \"inplace_padding\": true, \"inter_node_bw\": 25, \"intra_node_bw\": 300, \"is_nightly_or_source\": false, \"is_predispatch\": false, \"joint_custom_post_pass\": null, \"joint_custom_pre_pass\": null, \"joint_graph_constant_folding\": true, \"keep_output_stride\": true, \"kernel_name_max_ops\": 10, \"layout_opt_default\": \"1\", \"layout_optimization\": true, \"loop_ordering_after_fusion\": false, \"max_autotune\": false, \"max_autotune_conv_backends\": \"ATEN,TRITON\", \"max_autotune_gemm\": false, \"max_autotune_gemm_backends\": \"ATEN,TRITON,CPP\", \"max_autotune_gemm_search_space\": \"DEFAULT\", \"max_autotune_pointwise\": false, \"max_autotune_subproc_graceful_timeout_seconds\": 0.0, \"max_autotune_subproc_result_timeout_seconds\": 60.0, \"max_autotune_subproc_terminate_timeout_seconds\": 0.0, \"max_epilogue_benchmarked_choices\": 1, \"max_fusion_size\": 64, \"max_pointwise_cat_inputs\": 8, \"memory_planning\": false, \"memory_pool\": \"intermediates\", \"mixed_mm_choice\": \"heuristic\", \"nan_asserts\": false, \"non_blocking_remote_cache_write\": false, \"online_softmax\": true, \"optimize_scatter_upon_const_tensor\": true, \"pad_channels_last\": false, \"pad_outputs\": false, \"padding_alignment_bytes\": 128, \"padding_stride_threshold\": 1024, \"pattern_matcher\": true, \"permute_fusion\": false, \"pick_loop_orders\": true, \"post_grad_custom_post_pass\": null, \"post_grad_custom_pre_pass\": null, \"post_grad_fusion_options\": {}, \"pre_grad_custom_pass\": null, \"pre_grad_fusion_options\": {}, \"profile_bandwidth\": false, \"profile_bandwidth_output\": null, \"profile_bandwidth_regex\": \"\", \"profile_bandwidth_with_do_bench_using_profiling\": false, \"profiler_mark_wrapper_call\": false, \"prologue_fusion\": true, \"realize_acc_reads_threshold\": 8, \"realize_opcount_threshold\": 30, \"realize_reads_threshold\": 4, \"remove_pre_grad_passes\": null, \"reorder_for_compute_comm_overlap\": false, \"reorder_for_compute_comm_overlap_passes\": [\"reorder_compute_for_overlap\", \"sink_waits\", \"raise_comms\"], \"reorder_for_locality\": true, \"reorder_for_peak_memory\": true, \"reorder_prefetch_limit\": null, \"rocm.arch\": [], \"rocm.ck_dir\": null, \"rocm.ck_supported_arch\": [\"gfx90a\", \"gfx942\"], \"rocm.compile_opt_level\": \"-O2\", \"rocm.flush_denormals\": true, \"rocm.generate_test_runner\": false, \"rocm.is_debug\": false, \"rocm.kBatch_sweep\": null, \"rocm.n_max_profiling_configs\": null, \"rocm.print_kernel_resource_usage\": false, \"rocm.rocm_home\": null, \"rocm.save_temps\": false, \"rocm.split_k_threshold\": 16, \"rocm.use_fast_math\": true, \"rocm.use_preselected_instances\": false, \"save_args\": false, \"scalar_asserts\": true, \"score_fusion_memory_threshold\": 10, \"search_autotune_cache\": false, \"shape_padding\": true, \"size_asserts\": true, \"sleep_sec_TESTING_ONLY\": null, \"split_cat_fx_passes\": true, \"split_reductions\": true, \"static_weight_shapes\": true, \"strict_static_cuda_launcher\": false, \"test_configs.autotune_choice_desc_regex\": null, \"test_configs.autotune_choice_name_regex\": null, \"test_configs.force_extern_kernel_in_multi_template\": false, \"test_configs.graphsafe_rng_func_ignores_fallback_random\": false, \"test_configs.max_mm_configs\": null, \"test_configs.runtime_triton_dtype_assert\": false, \"test_configs.static_cpp_dtype_assert\": false, \"trace.compile_profile\": false, \"trace.debug_dir\": null, \"trace.debug_log\": false, \"trace.dot_graph_shape\": null, \"trace.draw_orig_fx_graph\": false, \"trace.enabled\": true, \"trace.fx_graph\": true, \"trace.fx_graph_transformed\": true, \"trace.graph_diagram\": false, \"trace.info_log\": false, \"trace.ir_post_fusion\": true, \"trace.ir_pre_fusion\": true, \"trace.log_autotuning_results\": false, \"trace.log_inductor_triton_kernel_to_post_grad_node_info\": true, \"trace.log_url_for_graph_xform\": null, \"trace.output_code\": true, \"trace.save_real_tensors\": false, \"trace.upload_tar\": null, \"triton.autotune_at_compile_time\": null, \"triton.autotune_cublasLt\": true, \"triton.autotune_pointwise\": true, \"triton.autotune_with_sample_inputs\": false, \"triton.codegen_upcast_to_fp32\": true, \"triton.cooperative_reductions\": false, \"triton.cudagraph_dynamic_shape_warn_limit\": 50, \"triton.cudagraph_skip_dynamic_graphs\": false, \"triton.cudagraph_support_input_mutation\": false, \"triton.cudagraph_trees\": true, \"triton.cudagraph_trees_history_recording\": false, \"triton.cudagraph_unexpected_rerecord_limit\": 128, \"triton.cudagraphs\": false, \"triton.debug_sync_graph\": false, \"triton.debug_sync_kernel\": false, \"triton.dense_indexing\": false, \"triton.descriptive_names\": \"original_aten\", \"triton.disallow_failing_autotune_kernels_TESTING_ONLY\": false, \"triton.divisible_by_16\": true, \"triton.enable_persistent_tma_matmul\": false, \"triton.fast_path_cudagraph_asserts\": false, \"triton.force_cooperative_reductions\": false, \"triton.force_cudagraph_sync\": false, \"triton.force_cudagraphs_warmup\": false, \"triton.inject_relu_bug_TESTING_ONLY\": null, \"triton.max_tiles\": 2, \"triton.min_split_scan_rblock\": 256, \"triton.multi_kernel\": 0, \"triton.persistent_reductions\": true, \"triton.prefer_nd_tiling\": false, \"triton.skip_cudagraph_warmup\": false, \"triton.skip_l1_cache\": false, \"triton.slow_path_cudagraph_asserts\": true, \"triton.spill_threshold\": 16, \"triton.store_cubin\": false, \"triton.tile_reductions\": false, \"triton.tiling_prevents_pointwise_fusion\": true, \"triton.tiling_prevents_reduction_fusion\": true, \"triton.unique_kernel_names\": true, \"triton.unique_user_kernel_names\": false, \"triton.use_block_ptr\": false, \"triton_kernel_default_layout_constraint\": \"needs_fixed_stride_order\", \"unbacked_symint_fallback\": 8192, \"unroll_reductions_threshold\": 8, \"unsafe_ignore_unsupported_triton_autotune_args\": false, \"unsafe_marked_cacheable_functions\": {}, \"unsafe_skip_cache_dynamic_shape_guards\": false, \"use_experimental_benchmarker\": false, \"use_fast_math\": false, \"use_mixed_mm\": true, \"use_static_cuda_launcher\": false, \"verbose_progress\": false, \"warn_mix_layout\": false, \"worker_start_method\": \"subprocess\"}", "remote_cache_version": 14, "inductor_fx_remote_cache_hit_count": null, "inductor_fx_remote_cache_miss_count": null, "inductor_fx_remote_cache_backend_type": "_ManifoldCache", "inductor_fx_remote_cache_hit_keys": null, "inductor_fx_remote_cache_miss_keys": null, "cuda_version": "12.4.0", "triton_version": "3.2.0", "feature_usage": {"fx_cache": false, "parallel_compile_post_warmup": false}, "compile_time_autotune_time_us": null, "is_runtime": false, "gc_time_us": null, "tensorify_float_attempt": null, "tensorify_float_success": null, "tensorify_float_failure": null, "guard_latency_us": null, "recompile_reason": null, "num_graph_breaks": 0, "triton_kernel_compile_times_us": "[[\"triton_poi_fused_mul_1\", 1701537], [\"triton_poi_fused_addmm_gelu_2\", 22357], [\"triton_poi_fused_addmm_relu_sigmoid_0\", 19836]]", "ir_count": null, "cudagraph_skip_reason": null, "python_version": "3.12.10+meta (3.12:0cc8128, Apr 08 2025, 11:35:30) [Clang 17.0.4 (mononoke://mononoke.internal.tfbnw.net/fbsource faab6f8b235019378", "pgo_put_remote_code_state_time_us": null, "pgo_get_remote_code_state_time_us": null, "param_numel": null, "param_bytes": null, "param_count": null}, "stack": [{"line": 38, "name": "<module>", "filename": 0, "loc": "__invoke_main()"}, {"line": 35, "name": "__invoke_main", "filename": 0, "loc": "run_as_main(module, main_function)"}, {"line": 98, "name": "run_as_main", "filename": 1, "loc": "oss_run_as_main("}, {"line": 94, "name": "run_as_main", "filename": 2, "loc": "main()"}, {"line": 59, "name": "main", "filename": 3, "loc": "package_path = torch._inductor.aoti_compile_and_package(ep)"}, {"line": 150, "name": "aoti_compile_and_package", "filename": 11, "loc": "return aot_inductor_minifier_wrapper("}, {"line": 929, "name": "aot_inductor_minifier_wrapper", "filename": 12, "loc": "return func("}, {"line": 193, "name": "_aoti_compile_and_package_inner", "filename": 11, "loc": "aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)"}, {"line": 300, "name": "aot_compile", "filename": 11, "loc": "return compile_fx_aot("}, {"line": 1743, "name": "compile_fx_aot", "filename": 13, "loc": "get_metrics_context(),"}, {"line": 93, "name": "__exit__", "filename": 28, "loc": "self._on_exit("}, {"line": 1565, "name": "record_compilation_metrics", "filename": 27, "loc": "torch._logging.trace_structured("}]}
V0521 11:53:23.658000 3076301 /data/users/shangdiy/fbsource/fbcode/caffe2/torch/_dynamo/utils.py:1864] {"chromium_event": {}, "has_payload": "cc64362319ca1e555ab8a33dc5e2c5bf"}
	{
	"name": "compile_fx_aot",
	"ts": 1747853603658815.8,
	"args": {
	"compile_id": "None",
	"num_graph_breaks": 0,
	"frame_key": null,
	"co_name": null,
	"co_filename": null,
	"co_firstlineno": null,
	"cache_size": null,
	"accumulated_cache_size": null,
	"guard_count": null,
	"shape_env_guard_count": null,
	"graph_op_count": null,
	"graph_node_count": null,
	"graph_input_count": null,
	"fail_type": null,
	"fail_reason": null,
	"fail_user_frame_filename": null,
	"fail_user_frame_lineno": null,
	"non_compliant_ops": null,
	"compliant_custom_ops": null,
	"restart_reasons": null,
	"dynamo_time_before_restart_s": null,
	"has_guarded_code": null,
	"dynamo_config": "{\"_autograd_backward_strict_mode_conditional_banned_ops\": [\"stride\", \"storage_offset\", \"is_contiguous\"], \"_unsafe_skip_fsdp_module_guards\": false, \"accumulated_recompile_limit\": 256, \"allow_complex_guards_as_runtime_asserts\": false, \"allow_empty_graphs\": false, \"allow_ignore_mark_dynamic\": false, \"allow_rnn\": false, \"allow_unspec_int_on_nn_module\": false, \"allowed_functions_module_string_ignorelist\": [\"torch._decomp\", \"torch._prims\", \"torch._refs\", \"torch.distributions\", \"torch.testing\"], \"assume_static_by_default\": true, \"automatic_dynamic_local_pgo\": true, \"automatic_dynamic_remote_pgo\": null, \"automatic_dynamic_shapes\": true, \"automatic_dynamic_shapes_mark_as\": \"dynamic\", \"capture_autograd_function\": true, \"capture_dynamic_output_shape_ops\": false, \"capture_func_transforms\": true, \"capture_scalar_outputs\": false, \"capture_sparse_compute\": false, \"compiled_autograd\": false, \"compiled_autograd_kwargs_override\": {}, \"cprofile\": false, \"cudagraph_backend_keep_input_mutation\": false, \"cudagraph_backend_support_input_mutation\": false, \"dead_code_elimination\": true, \"disable\": false, \"do_not_emit_runtime_asserts\": false, \"dont_skip_tracing\": false, \"dynamic_shapes\": true, \"enable_compiler_collectives\": false, \"enable_cpp_framelocals_guard_eval\": true, \"enable_cpp_guard_manager\": true, \"enable_cpp_symbolic_shape_guards\": false, \"enable_faithful_generator_behavior\": true, \"enable_trace_contextlib\": true, \"enable_trace_unittest\": false, \"error_on_nested_fx_trace\": true, \"error_on_nested_jit_trace\": true, \"error_on_recompile\": false, \"fail_on_recompile_limit_hit\": false, \"fake_tensor_cache_crosscheck_enabled\": false, \"fake_tensor_cache_enabled\": true, \"fake_tensor_disable_inference_mode\": true, \"force_nn_module_property_static_shapes\": true, \"force_parameter_static_shapes\": true, \"force_unspec_int_unbacked_size_like_on_torchrec_kjt\": false, \"graph_deduplication_lint\": false, \"guard_nn_modules\": true, \"guard_nn_modules_using_dict_tags\": true, \"inline_inbuilt_nn_modules\": true, \"install_free_tensors\": false, \"issue_3_13_0_warning\": true, \"minimum_call_count\": 1, \"numpy_default_complex\": \"complex128\", \"numpy_default_float\": \"float64\", \"numpy_default_int\": \"int64\", \"only_allow_pt2_compliant_ops\": false, \"optimize_ddp\": true, \"optimize_ddp_lazy_compile\": false, \"prefer_deferred_runtime_asserts_over_guards\": false, \"prepare_freezing\": false, \"raise_on_ctx_manager_usage\": true, \"raise_on_unsafe_aot_autograd\": false, \"recompile_limit\": 8, \"record_compile_time_instruction_count\": false, \"replay_record_enabled\": false, \"report_guard_failures\": true, \"rewrite_assert_with_torch_assert\": true, \"run_gc_after_compile\": true, \"skip_code_recursive_on_recompile_limit_hit\": true, \"skip_fsdp_guards\": true, \"skip_fsdp_hooks\": true, \"skip_nnmodule_hook_guards\": true, \"skip_no_tensor_aliasing_guards_on_parameters\": true, \"skip_tensor_guards_with_matching_dict_tags\": true, \"skip_torchrec\": true, \"skipfiles_inline_module_allowlist\": {}, \"specialize_float\": false, \"specialize_int\": false, \"suppress_errors\": false, \"trace_numpy\": true, \"track_nodes_for_deduplication\": false, \"use_graph_deduplication\": false, \"use_lazy_graph_module\": true, \"use_numpy_random_stream\": false, \"verify_correctness\": false, \"wrap_top_frame\": false}"
	},
	"ph": "E",
	"cat": "dynamo_timed",
	"tid": 0,
	"pid": 0
	}
