-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtensile_multicard_train.py
141 lines (101 loc) · 4.21 KB
/
tensile_multicard_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
import sys
import argparse
import shutil
import multiprocessing
from multiprocessing import Pool
from .hardware_detector import HardwareDetector
from .yaml_parse import YamlParse
from .common import LOG_DIR, WORK_DIR, CONFIG_DIR, LOGIC_DIR, OUTPUT_DIR, printExit, printWarning
from .output_struct import DirStructBuilder
# Module-level device list.
# NOTE(review): no function visible in this file reads this global -- multiRun
# takes a parameter with the same name and Train binds a local with this name.
deviceList = []
# Per-device logic output directories ("<LOGIC_DIR>/<device>/3_LibraryLogic").
# multiRun appends to this list; mergeLogicFile reads it afterwards.
logic_dir_list = []
def run(command):
    """Execute *command* through the shell and return its exit status.

    The command is echoed together with the current process id before it is
    started, so output from several worker processes can be told apart.

    Args:
        command: Shell command line passed unchanged to ``os.system``.

    Returns:
        The value returned by ``os.system`` (0 on success on POSIX shells).
        Earlier versions discarded it; returning it lets callers notice
        failed runs, while callers that ignore the result are unaffected.
    """
    print("[Process %s], command: %s" % (os.getpid(), command))
    return os.system(command)
def multiRun(tensile_root, conf_file_list, deviceList):
    """Launch one Tensile run per config file, each pinned to one device.

    The i-th config file is paired with the i-th entry of *deviceList*. For
    every pair a shell command is built that sets ``HIP_VISIBLE_DEVICES`` to
    the device index, runs ``<tensile_root>/Tensile/bin/Tensile`` with the
    config file and ``<LOGIC_DIR>/<device>/`` as arguments, and redirects
    stdout to ``<LOG_DIR>/<device>.log``. The commands are executed in
    parallel by ``run`` in a process pool with one worker per config file.
    ``<LOGIC_DIR>/<device>/3_LibraryLogic`` is appended to the module-level
    ``logic_dir_list`` for every launched run.

    Args:
        tensile_root: Root directory of the Tensile checkout.
        conf_file_list: Config file paths, one per training process.
        deviceList: Device indices; needs at least as many entries as
            *conf_file_list*.

    Raises:
        IndexError: If there are fewer devices than config files.
    """
    if not conf_file_list:
        # Pool(0) raises ValueError; with no configs there is nothing to fork.
        print("No config file to train, skip multi process.")
        return
    if len(deviceList) < len(conf_file_list):
        # Fail before any process is started instead of half-way through the
        # loop (which used to leave an unclosed pool behind).
        raise IndexError(
            "multiRun needs one device per config file: %s config files, %s devices"
            % (len(conf_file_list), len(deviceList))
        )

    train_command = "python " + tensile_root + "/Tensile/bin/Tensile"
    pending = []
    # The context manager guarantees the pool is torn down even if building
    # or submitting a command raises.
    with Pool(len(conf_file_list)) as pool:
        print("fork %s subprocess to train..." % len(conf_file_list))
        for device_idx, config_file in zip(deviceList, conf_file_list):
            log_file = LOG_DIR + "/" + str(device_idx) + ".log"
            logic_dir = LOGIC_DIR + "/" + str(device_idx) + "/"
            logic_dir_list.append(logic_dir + "3_LibraryLogic")
            command_line = (
                "HIP_VISIBLE_DEVICES=" + str(device_idx) + " " + train_command
                + " " + config_file + " " + logic_dir + " > " + log_file
            )
            pending.append((device_idx, pool.apply_async(run, [command_line])))
        pool.close()
        pool.join()

    # Report worker problems instead of dropping them silently. This stays
    # best-effort: a failed run is reported but does not abort the others.
    for device_idx, result in pending:
        try:
            status = result.get()
        except Exception as err:
            print("[multiRun] training on device %s raised: %s" % (device_idx, err))
            continue
        # A falsy status (0, or None when run reports nothing) is not an error.
        if status:
            print("[multiRun] training on device %s exited with status %s" % (device_idx, status))
    print("multi process done.")
def mergeLogicFile(tensile_path):
    """Combine the logic directories recorded in ``logic_dir_list``.

    With no recorded directory nothing is created. With exactly one, its
    files are copied into ``<LOGIC_DIR>/final``. With several, the first is
    copied into ``<LOGIC_DIR>/tmp`` and every further directory is merged
    with the previous result by Tensile's ``merge_rocblas_yaml_files.py``
    into ``<LOGIC_DIR>/tmp<i>``; the last result is copied into
    ``<LOGIC_DIR>/final``.

    Args:
        tensile_path: Root directory of the Tensile checkout holding
            ``Tensile/Utilities/merge_rocblas_yaml_files.py``.
    """
    def copy_contents(src_dir, dst_dir):
        # Shell copy of every entry matched by "<src_dir>/*" into dst_dir.
        os.system("cp %s/* %s" % (src_dir, dst_dir))

    print("merge start...")
    merge_script = "python " + tensile_path + "/Tensile/Utilities/merge_rocblas_yaml_files.py"
    print("all the logic files as below:")
    print(logic_dir_list)

    if not logic_dir_list:
        print("No file need to merge!")
        return

    final_dir = LOGIC_DIR + "/final"
    os.mkdir(final_dir)

    first_dir = logic_dir_list[0]
    remaining_dirs = logic_dir_list[1:]
    if not remaining_dirs:
        # A single result needs no merging, only publishing.
        copy_contents(first_dir, final_dir)
        return

    merged_dir = LOGIC_DIR + "/tmp"
    os.mkdir(merged_dir)
    copy_contents(first_dir, merged_dir)

    # Fold the remaining directories in one at a time; each step writes into
    # a fresh directory that becomes the input of the next step.
    for step, next_dir in enumerate(remaining_dirs, start=1):
        step_dir = LOGIC_DIR + "/tmp%s" % step
        os.mkdir(step_dir)
        command = "%s %s %s %s" % (merge_script, merged_dir, next_dir, step_dir)
        print(command)
        os.system(command)
        merged_dir = step_dir

    copy_contents(merged_dir, final_dir)
    print("merge done.")
def Train(usrArgs):
    """Parse the command line and run Tensile training on the selected GPUs.

    The usable devices are the GPUs reported by ``HardwareDetector().gpu``
    that were also requested with ``-g/--gpu``. With exactly one usable
    device Tensile is invoked directly with the given config and output
    path. With several, the working directories are recreated, the config
    is split by ``YamlParse``, the pieces are trained in parallel by
    ``multiRun``, the results are merged by ``mergeLogicFile`` and
    ``DirStructBuilder`` is called.

    Args:
        usrArgs: Argument list (without the program name) holding
            ``config_path``, ``output_path``, ``tensile_path`` and the
            optional ``-g/--gpu`` indices. ``None`` makes argparse fall back
            to ``sys.argv[1:]``.

    Returns:
        None.
    """
    argParser = argparse.ArgumentParser()
    argParser.add_argument("config_path", help="config.yaml file path")
    argParser.add_argument("output_path", help="output file path, default is folder output in the current folder.")
    argParser.add_argument("tensile_path", help="root path of tensile")
    argParser.add_argument("-g", "--gpu", type=int, dest='gpu',
                           help="gpu idx which used, will use all the valid gpu default",
                           required=False, nargs='*')
    # Parse the arguments the caller handed in; the previous parse_args()
    # call silently ignored usrArgs and always read sys.argv.
    args = argParser.parse_args(usrArgs)

    hardware = HardwareDetector()
    # NOTE(review): the help text promises "all valid GPUs" by default, but
    # the fallback has always been device 0 only. Behaviour is kept; confirm
    # which of the two is intended.
    requested_gpus = args.gpu if args.gpu else [0]
    # Sorted so the config-to-device pairing is deterministic; iteration
    # order of a set intersection is not specified.
    deviceList = sorted(set(hardware.gpu).intersection(set(requested_gpus)))
    if not deviceList:
        print("No GPU available or you choose the invalid gpu!")
        return
    print("%s gpu will be used. devices list : %s" % (len(deviceList), deviceList))

    # Use a separate local: assigning to OUTPUT_DIR here made that name local
    # to the whole function, so an empty output_path ended in
    # UnboundLocalError instead of falling back to the imported default.
    output_dir = args.output_path if args.output_path != "" else OUTPUT_DIR

    # if just use 1 gpu, the process will follow the default flows:
    if 1 == len(deviceList):
        tensile_path = args.tensile_path + "/Tensile/bin/Tensile"
        command = "HIP_VISIBLE_DEVICES={} python {} {} {}".format(
            deviceList[0], tensile_path, args.config_path, output_dir)
        print(command)
        os.system(command)
        return

    # multi process start...
    # NOTE(review): output_path is not passed to anything on this path;
    # verify where DirStructBuilder takes its output location from.
    # Start from a clean working tree so the os.mkdir calls below cannot
    # collide with a previous run.
    if os.path.exists(WORK_DIR):
        shutil.rmtree(WORK_DIR)
    os.mkdir(WORK_DIR)
    os.mkdir(CONFIG_DIR)
    os.mkdir(LOG_DIR)
    os.mkdir(LOGIC_DIR)

    try:
        parse = YamlParse(args.config_path, deviceList)
    except (ValueError, IOError):
        printExit("[Train] yaml parse failed!")
    else:
        multiRun(args.tensile_path, parse.run(), deviceList)
        mergeLogicFile(args.tensile_path)
        DirStructBuilder()
def main():
    """Console entry point: hand the command-line arguments to Train."""
    # Drop the program name; Train expects only the user-supplied arguments.
    cli_args = sys.argv[1:]
    Train(cli_args)
if __name__ == "__main__":
    # Direct execution is not supported; report that through printExit and
    # point the user at the launcher script instead.
    _script_mode_message = "This is can no longer be run as script. Run 'tensile_multicard_train/bin/train' instead."
    printExit(_script_mode_message)