
Python base.BaseDataset class code examples


This article compiles typical usage examples of the Python class pySPACE.resources.dataset_defs.base.BaseDataset. If you are wondering what the BaseDataset class is for, how it is used, or what working examples look like, the selected class code examples below should help.



The following presents 20 code examples of the BaseDataset class, sorted by popularity by default.
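Before the individual examples, here is a minimal sketch of the BaseDataset calls that recur below (loading a dataset, reading meta data, addressing samples by run/split, and storing again); the collection names under the pySPACE storage directory are hypothetical.

import os

import pySPACE
from pySPACE.resources.dataset_defs.base import BaseDataset

# Hypothetical dataset directory inside the configured pySPACE storage
dataset_dir = os.path.join(pySPACE.configuration.storage, "my_collection")

# Load the dataset (the directory is expected to contain a metadata.yaml)
dataset = BaseDataset.load(dataset_dir)

# Meta data is a plain dictionary
print(dataset.meta_data.get("storage_format"))

# Samples are addressed by (run, split, "train"/"test")
for run in dataset.get_run_numbers():
    for split in dataset.get_split_numbers():
        test_data = dataset.get_data(run, split, "test")

# Meta data can also be read and written without loading the samples
meta = BaseDataset.load_meta_data(dataset_dir)
BaseDataset.store_meta_data(dataset_dir, meta)

# Store the (possibly modified) dataset at a new location
dataset.store(os.path.join(pySPACE.configuration.storage, "my_copy"))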

Example 1: _merge_pickle_files

 def _merge_pickle_files(self, target_collection_path, source_collection_pathes,
                               train_set_name_suffix, target_collection_params):
     """ Merge all collections in source_collection_pathes and store them \
         in the target collection"""
     
     # load a first collection, in which the data of all other collections 
     # is assembled
     target_collection = BaseDataset.load(source_collection_pathes[0])
     try:
         author = pwd.getpwuid(os.getuid())[4]
     except:
         author = "unknown"
         self._log("Author could not be resolved.",level=logging.WARNING)
     date = time.strftime("%Y%m%d_%H_%M_%S")
     # Delete node_chain file name
     try:
         target_collection.meta_data.pop("node_chain_file_name")
     except:
         pass
     # Update meta data and store it
     k = "test" if self.reverse else "train"
     target_collection_params["__INPUT_DATASET__"][k] = \
              [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
     target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
     target_collection.meta_data.update({
             "author" : author, 
             "date" : date, 
             "dataset_directory" : target_collection_path,
             "train_test" : True,
             "parameter_setting" : target_collection_params,
             "input_collection_name" : source_collection_pathes[0][len(
                                     pySPACE.configuration.storage):]
     })
   
     # merge data of all other collections to target collection
     for source_collection_path in source_collection_pathes[1:]:
         source_collection = BaseDataset.load(source_collection_path)
         for run in source_collection.get_run_numbers():
             for split in source_collection.get_split_numbers():
                 data = source_collection.get_data(run, split, 
                                                       train_set_name_suffix)
                 target_data = target_collection.get_data(run, split, 
                                                       train_set_name_suffix)
                 # actual data is stored in a list that has to be extended
                 target_data.extend(data)
                 
     # if only test data was given, the "Rest_vs" collection is stored as 
     # training data
     if not self.reverse and "test" == train_set_name_suffix: 
         # exchange the "test" in key tuple to "train" before storing
         for key in target_collection.data.keys():
             assert("test" == key[2])
             value = target_collection.data.pop(key)
             key = (key[0],key[1],"train")
             target_collection.data[key] = value
                 
     target_collection.store(target_collection_path)
Developer: BioinformaticsArchive, Project: pyspace, Lines: 57, Source: merge.py


Example 2: store

    def store(self, result_dir, s_format = "None"):
        if not s_format == "None":
            self._log("The format %s is not supported!"%s_format, level=logging.CRITICAL)
            return
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "only output of individual nodes stored",
                                      "storage_format": s_format,
                                      "author" : author,
                                      "data_pattern": "no data stored"})

        # Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)
Developer: MMKrell, Project: pyspace, Lines: 13, Source: dummy.py


Example 3: __init__

    def __init__(self, dataset_dir, command_template, parametrization,
                 run_number, split_number, operation_result_dir,
                 hide_parameters = []):
        
        super(WEKAFilterProcess, self).__init__()
        
        # Determine the directory in which the process' results
        # are stored
        result_collection_name = dataset_dir.split(os.sep)[-2]
        for parameter_name, parameter_value in parametrization.iteritems():
            # If this is a parameter that should not be hidden, then we have to
            # encode it in the result collection name 
            if not parameter_name in hide_parameters:
                result_collection_name += "{__%s__:%s}" % (parameter_name.upper(),
                                                           parameter_value)
                                                                     
        self.result_directory = os.path.join(operation_result_dir,
                                             result_collection_name)
        
        # Create directory for intermediate results if it does not exist yet
        create_directory(self.result_directory 
                              + os.sep + "data_run%s" % run_number)
                
        # Create collection
        collection = BaseDataset.load(dataset_dir)
        
        # The parametrization that is independent of the collection type 
        # and the specific weka command template that is executed
        self.params = {"dataset_name": dataset_dir.replace('/','_'),
                       "dataset_dir": dataset_dir,
                       "run_number": run_number,
                       "split_number": split_number,
                       "weka_class_path": pySPACE.configuration.weka_class_path,
                       "temp_results": self.result_directory}

        # Load the abbreviations
        abbreviations_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                               'operations/weka_templates',
                                               'abbreviations.yaml'), 'r')
        self.abbreviations = yaml.load(abbreviations_file)
        # Add custom parameters for the weka command template
        for parameter_name, parameter_value in parametrization.iteritems():
            # Auto-expand abbreviations
            if parameter_value in self.abbreviations:
                parameter_value = self.abbreviations[parameter_value]
            self.params[parameter_name] = parameter_value
            
        # Build the WEKA command by repeatedly replacing all placeholders in 
        # the template 
        while True:
            instantiated_template = command_template % self.params
            if instantiated_template == command_template:
                # All placeholders replaced
                self.weka_command = instantiated_template
                break
            else:
                # We have to continue since we are not converged
                command_template = instantiated_template
        
        self.handler_class = None
Developer: Crespo911, Project: pyspace, Lines: 60, Source: weka_filter.py


Example 4: _createProcesses

 def _createProcesses(cls, processes, result_directory, operation_spec, 
             parameter_settings, input_collections, command_template):     
  
     # For each combination of classifier, input-collection and
     # run number, create one WEKA_process
     for dataset_dir in input_collections:
         collection = BaseDataset.load(dataset_dir)
         # Determine the number of iterations and splits to be used
         iterations = collection.meta_data["runs"]
         splits = collection.meta_data["splits"] 
         if "runs" in operation_spec:
             assert(iterations in [1, operation_spec["runs"]])
             iterations = operation_spec["runs"]
         if "cv_folds" in operation_spec:
             assert(splits in [1, operation_spec["cv_folds"]])
             splits = operation_spec["cv_folds"]          
         
         for parametrization in parameter_settings: 
             for run_number in range(iterations):
                 process = WEKAClassificationProcess(dataset_dir,
                                                     command_template,
                                                     parametrization,
                                                     splits,
                                                     run_number,
                                                     result_directory)
                 processes.put(process)
     # signal the executing process that creation is now finished
     processes.put(False)
Developer: AlexanderFabisch, Project: pyspace, Lines: 28, Source: weka_classification.py


Example 5: _copy_file

 def _copy_file(self, source_collection_path, target_collection_path,
                train_set_name_suffix):
     """ Copy a dataset to a new destination 
     
     **Parameters**
     
         :source_collection_path:
             The path to the dataset that has to be copied.
             
         :target_collection_path:
             The path to where the dataset should be copied.
             
         :train_set_name_suffix:
             Either 'train' or 'test'. Specifies if the target dataset is
             handled as training or testing data.
     """ 
     source_collection = BaseDataset.load(source_collection_path)
     # if only test data was given, the "Rest_vs" collection is stored as 
     # training data
     if self.reverse and "test" == train_set_name_suffix: 
         # exchange the "test" in key tuple to "train" before storing
         for key in source_collection.data.keys():
             assert("test" == key[2])
             value = source_collection.data.pop(key)
             key = (key[0],key[1],"train")
             source_collection.data[key] = value
     # we store the data in the same format as before
     source_collection.store(target_collection_path, 
         source_collection.meta_data["storage_format"])
Developer: MMKrell, Project: pyspace, Lines: 29, Source: merge.py


Example 6: store

    def store(self, result_dir, s_format = "None"):
        if not s_format == "None":
            self._log("The format %s is not supported!"%s_format, level=logging.CRITICAL)
            return
        # Update the meta data
        try:
            author = pwd.getpwuid(os.getuid())[4]
        except:
            author = "unknown"
            self._log("Author could not be resolved.",level=logging.WARNING)
        self.update_meta_data({"type": "only output of individual nodes stored",
                                      "storage_format": s_format,
                                      "author" : author,
                                      "data_pattern": "no data stored"})

        # Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)
Developer: AlexanderFabisch, Project: pyspace, Lines: 17, Source: dummy.py


Example 7: create

    def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
        """
        A factory method that creates a WEKA operation based on the
        information given in the operation specification operation_spec
        """
        assert(operation_spec["type"] == "weka_classification")
        # Determine all parameter combinations that should be tested
        parameter_settings = cls._get_parameter_space(operation_spec)
        
        # Read the command template from a file
        template_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                               "operations",
                                               "weka_templates",
                                               operation_spec["template"]),
                             'r')
        command_template = template_file.read()
        template_file.close() 

        # number of processes
        if "runs" in operation_spec:
            number_processes = len(input_paths) * len(parameter_settings) * \
                           operation_spec["runs"]
        else: # approximate the number of processes 
            runs = []
            for dataset_dir in input_paths:
                collection = BaseDataset.load(dataset_dir)
                runs.append(collection.meta_data["runs"])
            runs = max(runs)
            number_processes = len(input_paths) * len(parameter_settings) * \
                               runs
        
        if debug == True:
            # To better debug creation of processes we don't limit the queue 
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, operation_spec, 
                                 parameter_settings, input_paths,
                                 command_template)
            # create and return the weka operation object
            return cls(processes, operation_spec, result_directory, 
                       number_processes)
        else:
            # Create all processes by calling a recursive helper method in
            # another thread so that already created processes can be executed
            # in parallel. A queue with a limited size is used to guarantee
            # that not too many objects are created at once (because this costs
            # memory). However, the actual number of 100 is arbitrary and might
            # be reviewed.
            processes = processing.Queue(100)
            create_process = processing.Process(target=cls._createProcesses,
                             args=( processes, result_directory, operation_spec, 
                                    parameter_settings, input_paths,
                                    command_template))
            create_process.start()            
            # create and return the weka operation object
            return cls(processes, operation_spec, result_directory, 
                       number_processes, create_process)        
Developer: AlexanderFabisch, Project: pyspace, Lines: 57, Source: weka_classification.py


Example 8: _get_result_dataset_dir

    def _get_result_dataset_dir(base_dir, input_dataset_dir,
                                   parameter_setting, hide_parameters):
        """ Determines the name of the result directory

        Determines the name of the result directory based on the
        input_dataset_dir, the node_chain_name and the parameter setting.
        """
        input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
        input_name = input_name.strip("{}")
        # If the input is already the result of an operation
        if input_name.count("}{") > 0:
            input_name_parts = input_name.split("}{")
            input_name = input_name_parts[0]

        # Load the input meta data
        dataset_dir = os.sep.join([pySPACE.configuration.storage,
                                                input_dataset_dir])
        dataset_md = BaseDataset.load_meta_data(dataset_dir)

        # We are going to change the parameter_setting and don't want to
        # interfere with later runs so we work on a copy
        parameter_setting = copy.deepcopy(parameter_setting)

        # Ignore pseudo parameter "__PREPARE_OPERATION__"
        if "__PREPARE_OPERATION__" in parameter_setting:
            parameter_setting.pop("__PREPARE_OPERATION__")

        # Add the input parameters meta data to the given parameter setting
        if "parameter_setting" in dataset_md:
            parameter_setting.update(dataset_md["parameter_setting"])

        # We have to remove ' characters from the parameter value since
        # Weka does ignore them
        for key, value in parameter_setting.iteritems():
            if isinstance(value, basestring) and value.count("'") > 1:
                parameter_setting[key] = eval(value)

        # Determine the result_directory name
        # The separator between key and value was changed from ":" to "#"
        # because of problems on Windows and with Windows file servers
        parameter_str = "}{".join(("%s#%s" % (key, value))
                                        for key, value in parameter_setting.iteritems()
                                            if key not in hide_parameters)

        result_name =  "{%s}" % input_name

        if parameter_str != "":
            result_name += "{%s}" % (parameter_str)

        # Determine the path where this result will be stored
        # and create the directory if necessary
        result_dir = base_dir
        result_dir += os.sep + result_name
        create_directory(result_dir)

        return result_dir
Developer: AlexanderFabisch, Project: pyspace, Lines: 56, Source: node_chain.py


Example 9: __call__

    def __call__(self):
        """ Executes this process on the respective modality """
        # Restore configuration
        pySPACE.configuration = self.configuration

        # reduce log_level for processing a second time and
        # set communication possibility for nodes to backend
        pySPACE.configuration.min_log_level = self.min_log_level
        pySPACE.configuration.logging_com = self.handler_args
        pySPACE.configuration.backend_com = self.backend_com

        ############## Prepare benchmarking ##############
        super(NodeChainProcess, self).pre_benchmarking()

        # Load the data and check that it can be processed
        # Note: This can not be done in the objects constructor since in
        # that case the whole input would need to be pickled
        # when doing the remote call
        abs_dataset_dir = os.sep.join([self.storage,
                                          self.rel_dataset_dir])

        input_collection = BaseDataset.load(abs_dataset_dir)

        # We have to remember parameters used for generating this specific
        # input dataset
        if 'parameter_setting' in input_collection.meta_data.keys():
            # but not __INPUT_DATASET__ and __RESULT_DIRECTORY__
            for k, v in input_collection.meta_data['parameter_setting'].items():
                if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
                    self.parameter_setting[k] = v

        NodeChainProcess._check_node_chain_dataset_consistency(self.node_chain,
                                                       input_collection)

        ############## Do the actual benchmarking ##############

        self._log("Start benchmarking run %s of node_chain %s on dataset %s"
                                % (self.run,
                                   self.node_chain_spec,
                                   self.rel_dataset_dir))


        # Do the actual benchmarking for this collection/node_chain combination
        try:
            result_collection = \
                self.node_chain.benchmark(input_collection = input_collection,
                                         run = self.run,
                                         persistency_directory = self.persistency_dir,
                                         store_node_chain = self.store_node_chain)
        except Exception, exception:
            # Send Exception to Logger
            import traceback
            print traceback.format_exc()
            self._log(traceback.format_exc(), level = logging.ERROR)
            raise
Developer: AlexanderFabisch, Project: pyspace, Lines: 55, Source: node_chain.py


Example 10: _copy_pickle_file

 def _copy_pickle_file(self, source_collection_path, target_collection_path,
                       train_set_name_suffix):
     
     source_collection = BaseDataset.load(source_collection_path)
     # if only test data was given, the "Rest_vs" collection is stored as 
     # training data
     if self.reverse and "test" == train_set_name_suffix: 
         # exchange the "test" in key tuple to "train" before storing
         for key in source_collection.data.keys():
             assert("test" == key[2])
             value = source_collection.data.pop(key)
             key = (key[0],key[1],"train")
             source_collection.data[key] = value
     source_collection.store(target_collection_path)
Developer: BioinformaticsArchive, Project: pyspace, Lines: 14, Source: merge.py


Example 11: create

    def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
        """ A factory method that creates an Analysis operation based on the 
        information given in the operation specification operation_spec
        """
        assert(operation_spec["type"] == "analysis")
        input_path = operation_spec["input_path"]
        summary = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                      input_path))
        data_dict = summary.data

        # Determine the parameters that should be analyzed
        parameters = operation_spec["parameters"]
        
        # Determine the metrics that should be plotted
        metrics = operation_spec["metrics"]
        
        # Determine how many processes will be created
        number_parameter_values = [len(set(data_dict[param])) for param in parameters]
        number_processes = cls._numberOfProcesses(0, number_parameter_values)+1
        
        if debug == True:
            # To better debug creation of processes we don't limit the queue 
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, data_dict, parameters, 
                                   metrics, True)
            return cls( processes, operation_spec, result_directory, number_processes)
        else:
            # Create all plot processes by calling a recursive helper method in
            # another thread so that already created processes can be executed
            # although process creation is not finished yet. A queue with a
            # limited size is used to guarantee that not too many objects are
            # created (since this costs memory). However, the actual number of
            # 100 is arbitrary and might be changed according to the system at hand.
            processes = processing.Queue(100)
            create_process = processing.Process(target=cls._createProcesses,
                             args=( processes, result_directory, data_dict, 
                                    parameters, metrics, True))
            create_process.start()
            # create and return the operation object
            return cls( processes, operation_spec, result_directory, number_processes, create_process)        
Developer: AlexanderFabisch, Project: pyspace, Lines: 41, Source: analysis.py


Example 12: test_time_series_storing

    def test_time_series_storing(self):

        if os.path.exists('tmp') is False :
            os.makedirs('tmp')
        
        source = SimpleTimeSeriesSourceNode()
        sink = TimeSeriesSinkNode()
        sink.register_input_node(source)
        sink.set_run_number(0)
        sink.process_current_split()
        result_collection = sink.get_result_dataset()
        result_collection.store('tmp')
        #sink.store_results("test_time_series_storing.tmp")
        
        reloaded_collection = BaseDataset.load('tmp')
        
        reloader = TimeSeriesSourceNode()
        reloader.set_input_dataset(reloaded_collection)
        #set_permanent_attributes(time_series_file = "test_time_series_storing.tmp")
        
        orig_data = list(source.request_data_for_testing()) 
        restored_data = list(reloader.request_data_for_testing())
        
        # Check that the two list have the same length
        self.assertEqual(len(orig_data), len(restored_data),
                         "Numbers of time series before storing and after reloading are not equal!")
        
        # Check that there is a one-to-one correspondence
        for orig_datapoint, orig_label in orig_data:
            found = False
            for restored_datapoint, restored_label in restored_data:
                found |= (orig_datapoint.view(numpy.ndarray) == restored_datapoint.view(numpy.ndarray)).all() \
                            and (orig_label == restored_label)
                if found: break
            self.assert_(found, 
                         "One of the original time series cannot not be found after reloading")
        
        shutil.rmtree('tmp') # Cleaning up... 
Developer: Crespo911, Project: pyspace, Lines: 38, Source: test_time_series_sink.py


Example 13: create

 def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
     """
     A factory method that creates a statistic operation based on the
     information given in the operation specification operation_spec.
     If debug is TRUE the creation of the statistic processes will not
     be in a separated thread.
     """
     assert(operation_spec["type"] == "statistic")
     input_path = operation_spec["input_path"]
     tabular = BaseDataset.load(os.path.join(pySPACE.configuration.storage, input_path)).data
     
     if operation_spec.has_key("filter"):
         conditions= csv_analysis.empty_dict(tabular)
         for key,l in operation_spec["filter"].items():
             conditions[key].extend(l)
         tabular = csv_analysis.strip_dict(tabular,conditions)
     metric = operation_spec.get("metric","Balanced_accuracy")
     parameter = operation_spec.get("parameter","__Dataset__")
     rel_par = operation_spec.get("related_parameters",["__Dataset__", "Key_Run", "Key_Fold"])
     average = operation_spec.get("average",None)
     
     if average in rel_par:
         rel_par.remove(average)
     if metric in rel_par:
         rel_par.remove(metric)
     if parameter in rel_par:
         rel_par.remove(parameter)
         
     reduced_tabular=cls.reduce_tabular(tabular,rel_par,metric,parameter,average)
     number_processes = 1
     processes = processing.Queue()
     cls._createProcesses(processes, result_directory, reduced_tabular)
     
     import shutil
     shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path,"results.csv"), os.path.join(result_directory,"results.csv"))
     shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path,"metadata.yaml"), os.path.join(result_directory,"metadata.yaml"))
     # create and return the shuffle operation object
     return cls(processes, operation_spec, result_directory, number_processes)
Developer: AlexanderFabisch, Project: pyspace, Lines: 38, Source: statistic.py


Example 14: __call__


#......... part of the code is omitted here .........
             
                # Determine names of the original data sets the input 
                # datasets are based on
                base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
                base_dataset2 = dataset_name2.strip("}{").split("}{")[0]
                
                # Determine target dataset name and create directory
                # for it
                mixed_base_dataset = "%s_vs_%s" % (base_dataset1, 
                                                      base_dataset2)
                target_dataset_name = dataset_name1.replace(base_dataset1,
                                                                  mixed_base_dataset)
                
                target_dataset_dir = os.sep.join([self.result_directory,
                                                     target_dataset_name])
                
                create_directory(os.sep.join([target_dataset_dir, "data_run0"]))
                
                if splitted:
                    # For each split, copy the train data from dataset 1 and
                    # the test data from dataset 2 to the target dataset
                    for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
                                                                       "data_run0",
                                                                       "*_sp*_train.*"])):
                        # TODO: We have $n$ train sets and $n$ test sets, we
                        #       could use all $n*n$ combinations
                        target_train_file_name = source_train_file_name.replace(dataset_dir1,
                                                                                target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name, 
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name, 
                                       target_train_file_name)
                        
                        source_test_file_name = source_train_file_name.replace(dataset_dir1,
                                                                               dataset_dir2)
                        
                        source_test_file_name =  source_test_file_name.replace("train.",
                                                                                "test.")
                        target_test_file_name = target_train_file_name.replace("train.",
                                                                                "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name, 
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                else:
                    # Use the data set from dataset 1 as training set and 
                    # the data set from dataset 2 as test data
                    for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
                                                                         "data_run0",
                                                                         "*_sp*_test.*"])):
                        target_train_file_name = source_train_file_name.replace("test.",
                                                                                "train.")
                        target_train_file_name = target_train_file_name.replace(dataset_dir1,
                                                                                target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name, 
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name, 
                                       target_train_file_name)
                        
                        source_test_file_name = source_train_file_name.replace(dataset_dir1,
                                                                               dataset_dir2)
                        
                        target_test_file_name = target_train_file_name.replace("train.",
                                                                                "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name, 
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                # Write metadata.yaml based on input meta data
                input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)

                output_dataset_meta = dict(input_dataset1_meta)
                output_dataset_meta['train_test'] = True
                output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
                try:
                    output_dataset_meta['author'] = pwd.getpwuid(os.getuid())[4]
                except :
                    self._log("Author could not be resolved.",level=logging.WARNING)
                    output_dataset_meta['author'] = "unknown"
                BaseDataset.store_meta_data(target_dataset_dir,output_dataset_meta)
        
        ############## Clean up after benchmarking ##############
        super(ShuffleProcess, self).post_benchmarking()
Developer: AlexanderFabisch, Project: pyspace, Lines: 101, Source: shuffle.py


Example 15: prepare_training


#......... part of the code is omitted here .........
                                                             flow_spec = file(self.potentials[key]["prewindowing_flow"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["prewindowing_flow"])
            elif self.operation == "prewindowed_train":
                if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                    self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                                     flow_spec = file(self.potentials[key]["postprocess_flow"]))
                    # create windower
                    online_logger.info( "Creating Windower")
                    online_logger.info(self.potentials[key]["windower_spec_path_train"])
                    self.node_chains[key][0].set_windower_spec_file(os.path.join(spec_base, "node_chains", "windower", self.potentials[key]["windower_spec_path_train"]))
                    replace_start_and_end_markers = True
                else:
                    self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain, flow_spec = file(self.potentials[key]["postprocess_flow"]))
                    replace_start_and_end_markers = False

                final_collection = TimeSeriesDataset()
                final_collection_path = os.path.join(self.prewindowed_data_directory, key, "all_train_data")
                # delete previous training collection
                if os.path.exists(final_collection_path):
                    online_logger.info("deleting old training data collection for " + key)
                    shutil.rmtree(final_collection_path)

                # load all prewindowed collections and
                # append data to the final collection
                prewindowed_sets = \
                    glob.glob(os.path.join(self.prewindowed_data_directory, key, "*"))
                if len(prewindowed_sets) == 0:
                    online_logger.error("Couldn't find data, please do prewindowing first!")
                    raise Exception
                online_logger.info("concatenating prewindowed data from " + str(prewindowed_sets))

                for s,d in enumerate(prewindowed_sets):
                    collection = BaseDataset.load(d)
                    data = collection.get_data(0, 0, "train")
                    for d,(sample,label) in enumerate(data):

                        if replace_start_and_end_markers:
                            # in case we concatenate multiple 'Window' labeled
                            # sets we have to remove every start- and endmarker
                            for k in sample.marker_name.keys():
                                # find '{S,s}  8' or '{S,s}  9'
                                m = re.match("^s\s{0,2}[8,9]{1}$", k, re.IGNORECASE)
                                if m is not None:
                                    online_logger.info(str("remove %s from %d %d" % (m.group(), s, d)))
                                    del(sample.marker_name[m.group()])

                            if s == len(prewindowed_sets)-1 and \
                                d == len(data)-1:
                                # insert endmarker
                                sample.marker_name["S  9"] = [0.0]
                                online_logger.info("added endmarker" + str(s) + " " + str(d))

                            if s == 0 and d == 0:
                                # insert startmarker
                                sample.marker_name["S  8"] = [0.0]
                                online_logger.info("added startmarker" + str(s) + " " + str(d))

                        final_collection.add_sample(sample, label, True)

                # save final collection (just for debugging)
                os.mkdir(final_collection_path)
                final_collection.store(final_collection_path)

                online_logger.info("stored final collection at " + final_collection_path)
Developer: AlexanderFabisch, Project: pyspace, Lines: 66, Source: trainer.py


Example 16: consolidate

    def consolidate(self):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.
        """
        # Consolidate the results
        directory_pattern = os.sep.join([self.result_directory, "{*",])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            # Load their meta_data
            meta_data = BaseDataset.load_meta_data(dataset_path)

            # Determine author and date
            try:
                author = pwd.getpwuid(os.getuid())[4]
            except:
                author = "unknown"
                self._log("Author could not be resolved.",level=logging.WARNING)
            date = time.strftime("%Y%m%d_%H_%M_%S")

            # Update meta data and store it
            meta_data.update({"author" : author, "date" : date})
            BaseDataset.store_meta_data(dataset_path, meta_data)

            # Copy the input dataset specification file to the result
            # directory in order to make later analysis of
            # the results more easy
            input_meta_path = os.sep.join([pySPACE.configuration.storage,
                                          meta_data["input_collection_name"]])
            input_meta = BaseDataset.load_meta_data(input_meta_path)
            BaseDataset.store_meta_data(dataset_path,input_meta,
                                        file_name="input_metadata.yaml")
        # Check if some results consist of several runs
        # and update the meta data in this case
        # TODO: This is not a clean solution
        for dataset_dir in glob.glob(os.sep.join([self.result_directory,
                                                     "*"])):
            if not os.path.isdir(dataset_dir): continue
            # There can be either run dirs, persistency dirs, or both of them.
            # Check which of the two there are more of. If both exist, their numbers
            # are supposed to be equal.
            nr_run_dirs = len(glob.glob(os.sep.join([dataset_dir,
                                              "data_run*"])))
            nr_per_dirs = len(glob.glob(os.sep.join([dataset_dir,
                                              "persistency_run*"])))
            nr_runs = max(nr_run_dirs, nr_per_dirs)

            if nr_runs > 1:
                collection_meta = BaseDataset.load_meta_data(dataset_dir)
                collection_meta["runs"] = nr_runs
                BaseDataset.store_meta_data(dataset_dir,collection_meta)
        # If we don't create a feature vector or time series collection,
        # we evaluated our classification using a classification performance sink.
        # The resulting files should be merged to one csv tabular.
        pathlist = glob.glob(os.path.join(self.result_directory,"results_*"))
        if len(pathlist)>0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            result_collection = PerformanceResultSummary(dataset_dir=self.result_directory)
            self._log("done")
            self._log("Storing result collection")
            result_collection.store(self.result_directory)
            self._log("done")
            PerformanceResultSummary.merge_traces(self.result_directory)

            if not(self.compression == False):
                # Since we get one result summary,
                # we don't need the numerous folders.
                # So we zip them to make the whole folder more easy visible.
                import zipfile
                cwd=os.getcwd()
                os.chdir(self.result_directory)
                # If there are too many or too large folders, problems may occur.
                # In this case we want to log it, try 64 bit mode, and then skip the zipping.
                try:
                    pathlist = glob.glob(os.path.join(self.result_directory,"{*}"))
                    
                    if not self.compression == "delete":                        
              
