인문지식 처리와 프로그래밍2020 4.02
soook
parser01.py(대단위)
#!/usr/bin/python
#-*- coding: utf-8 -*-
import sys
def main():
    """Print every tab-separated field of ``<argv[1]>.lst`` on its own line.

    The input path is built by appending ``.lst`` to the first command-line
    argument, and the file is read as UTF-8.  For each input line the first
    field is printed at the left margin and every following field is printed
    prefixed with two tab characters.

    If no file name was given, a usage line is printed and the function
    returns without reading anything.
    """
    try:
        filename = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; a bare ``except`` would
        # also hide unrelated errors such as KeyboardInterrupt.
        print('\nparser01.py [file name]')
        return

    # Not called ``list`` so the builtin of that name stays usable.
    list_path = filename + '.lst'

    # The context manager closes the file even if printing raises.
    with open(list_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Remove only the line terminator: a plain strip() would also
            # remove leading/trailing tabs and thereby drop empty first or
            # last columns, shifting every other field.
            fields = line.rstrip('\r\n').split('\t')
            for position, field in enumerate(fields):
                if position == 0:
                    print(field)
                else:
                    print('\t\t{0}'.format(field))
# Run the parser only when this file is executed as a script, so importing
# it does not read sys.argv or open any file.
if __name__ == '__main__':
    main()
parser02.py(탭단위 세분화)
#!/usr/bin/python
#-*- coding: utf-8 -*-
import sys
def main():
    """Print one reformatted record per row of ``<argv[1]>.lst``.

    Command line: ``parser02.py [file name] [task]``.  The input path is the
    first argument with ``.lst`` appended; the file is read as UTF-8 and each
    row is split on tab characters.

    Each output line has the form
    ``col0<TAB>col2_col3<TAB>col4-col5-col6<TAB>title`` where ``title`` is
    column 7 when ``task`` is ``'1'``, column 9 when ``task`` is ``'2'``
    (0-based), and the empty string for any other task value.

    Blank rows are skipped and rows with fewer than ten columns are treated
    as if the missing columns were empty.  If either argument is missing, a
    usage line is printed and nothing is read.
    """
    try:
        filename = sys.argv[1]
        task = sys.argv[2]
    except IndexError:
        # Only missing arguments are expected here; a bare ``except`` would
        # also hide unrelated errors such as KeyboardInterrupt.
        print('\nparser02.py [file name] [task: 1, 2]')
        return

    # Task value -> 0-based index of the column used as the title.
    title_index = {'1': 7, '2': 9}.get(task)
    # Highest column index read below is 9, so rows are padded to 10 fields.
    column_count = 10

    # Not called ``list`` so the builtin of that name stays usable.
    list_path = filename + '.lst'

    # The context manager closes the file even if a row fails to print.
    with open(list_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Remove only the line terminator: a plain strip() would also
            # remove trailing tabs, dropping empty trailing columns and
            # making the title lookup raise IndexError.
            row = line.rstrip('\r\n')
            if not row.strip():
                # A blank row has no fields to report.
                continue

            fields = row.split('\t')
            # Pad short rows so every index used below exists.
            fields.extend([''] * (column_count - len(fields)))

            title = '' if title_index is None else fields[title_index]
            print('{0}\t{1}_{2}\t{3}-{4}-{5}\t{6}'.format(
                fields[0], fields[2], fields[3],
                fields[4], fields[5], fields[6], title))
# Run the parser only when this file is executed as a script, so importing
# it does not read sys.argv or open any file.
if __name__ == '__main__':
    main()
parser03.py(엑셀자료->lst파일->국문/영문 파싱)
#!/usr/bin/python
#-*- coding: utf-8 -*-
import sys
def main():
    """Print one record per title word for every row of ``<argv[1]>.lst``.

    Command line: ``parser03 [file name] [task]``.  The input path is the
    first argument with ``.lst`` appended; the file is read as UTF-8 and each
    row is split on tab characters.

    The title is column 7 when ``task`` is ``'1'``, column 9 when ``task`` is
    ``'2'`` (0-based), and empty for any other task value.  Commas and
    parentheses are deleted from the title, which is then split on single
    spaces.  For each resulting word one line is printed in the form
    ``col0<TAB>col2_col3<TAB>col4-col5-col6<TAB>word``.

    Blank rows are skipped and rows with fewer than ten columns are treated
    as if the missing columns were empty.  A row whose title contains no
    words still produces a single line with an empty word.  If either
    argument is missing, a usage line is printed and nothing is read.
    """
    try:
        filename = sys.argv[1]
        task = sys.argv[2]
    except IndexError:
        # Only missing arguments are expected here; a bare ``except`` would
        # also hide unrelated errors such as KeyboardInterrupt.
        print('\nparser03 [file name] [task: 1, 2]')
        return

    # Task value -> 0-based index of the column used as the title.
    title_index = {'1': 7, '2': 9}.get(task)
    # Highest column index read below is 9, so rows are padded to 10 fields.
    column_count = 10
    # Built once: deletes ',', '(' and ')' in a single pass per title.
    drop_punctuation = str.maketrans('', '', ',()')

    # Not called ``list`` so the builtin of that name stays usable.
    list_path = filename + '.lst'

    # The context manager closes the file even if a row fails to print.
    with open(list_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Remove only the line terminator: a plain strip() would also
            # remove trailing tabs, dropping empty trailing columns and
            # making the title lookup raise IndexError.
            row = line.rstrip('\r\n')
            if not row.strip():
                # A blank row has no fields to report.
                continue

            fields = row.split('\t')
            # Pad short rows so every index used below exists.
            fields.extend([''] * (column_count - len(fields)))

            title = '' if title_index is None else fields[title_index]
            title = title.translate(drop_punctuation).strip()

            # Deleting punctuation can leave consecutive spaces ("A , B" ->
            # "A  B"); drop the empty pieces so they are not printed as words.
            words = [word for word in title.split(' ') if word] or ['']

            for word in words:
                print('{0}\t{1}_{2}\t{3}-{4}-{5}\t{6}'.format(
                    fields[0], fields[2], fields[3],
                    fields[4], fields[5], fields[6], word))
# Run the parser only when this file is executed as a script, so importing
# it does not read sys.argv or open any file.
if __name__ == '__main__':
    main()