 ca82244928
			
		
	
	
		ca82244928
		
	
	
	
	
		
			
			We are going to reuse the tesseract OCR code. Create a new tesseract_ocr() helper and use it. Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org> Message-Id: <20201021105035.2477784-5-f4bug@amsat.org> Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
		
			
				
	
	
		
			47 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			47 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # ...
 | |
| #
 | |
| # Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org>
 | |
| #
 | |
| # This work is licensed under the terms of the GNU GPL, version 2 or
 | |
| # later. See the COPYING file in the top-level directory.
 | |
| 
 | |
| import re
 | |
| import logging
 | |
| 
 | |
| from avocado.utils import process
 | |
| from avocado.utils.path import find_command, CmdNotFoundError
 | |
| 
 | |
def tesseract_available(expected_version):
    """Tell whether the installed tesseract has the expected major version.

    :param expected_version: major version number to compare against
                             (compared with ``==`` to the parsed integer).
    :returns: True when a ``tesseract`` command is found and the major
              version parsed from ``tesseract --version`` equals
              ``expected_version``; False when the command is missing or
              no version number can be parsed from its output.
    """
    try:
        find_command('tesseract')
    except CmdNotFoundError:
        return False
    res = process.run('tesseract --version')
    # The version is the second word of the banner. Look on stdout first and
    # fall back to stderr when stdout does not carry it (presumably some
    # tesseract releases print the banner there -- TODO confirm).
    words = res.stdout_text.split()
    if len(words) < 2:
        words = res.stderr_text.split()
    if len(words) < 2:
        # Nothing resembling "<name> <version>" on either stream.
        return False
    # Only the leading major number matters; tolerate a "v" prefix so that
    # a token such as "v4.0.0" is handled like "4.0.0" instead of raising.
    match = re.match(r'v?(\d+)', words[1])
    if match is None:
        return False
    # now this is guaranteed to be made of digits
    return int(match.group(1)) == expected_version
 | |
| 
 | |
| 
 | |
def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
    """Run tesseract on an image and return the text lines it printed.

    :param image_path: path of the image, inserted as-is into the command.
    :param tesseract_args: extra command-line arguments placed before the
                           image path.
    :param tesseract_version: when equal to 4, ``' --oem 1'`` is appended
                              to ``tesseract_args``.
    :returns: list of the non-empty, whitespace-stripped lines of the
              command's standard output, in order.
    """
    log = logging.getLogger('tesseract')
    log.debug(image_path)

    if tesseract_version == 4:
        tesseract_args += ' --oem 1'

    command = "tesseract {} {} stdout".format(tesseract_args, image_path)
    result = process.run(command)

    # Strip every output line and keep only those with remaining content.
    stripped = (raw.strip() for raw in result.stdout_text.split('\n'))
    recognised = [text for text in stripped if text]

    # Each kept line is also sent to the 'tesseract' logger at debug level.
    for text in recognised:
        log.debug(text)
    return recognised
 |