"인문지식 처리와 프로그래밍2020 4.02"의 두 판 사이의 차이
soook
(→parser01.py) |
(→parser02.py) |
||
| 37번째 줄: | 37번째 줄: | ||
</pre> | </pre> | ||
| − | ==parser02.py== | + | ==parser02.py(탭단위 세분화)== |
<pre> | <pre> | ||
#!/usr/bin/python | #!/usr/bin/python | ||
| 81번째 줄: | 81번째 줄: | ||
</pre> | </pre> | ||
| − | |||
| − | |||
| − | |||
| − | |||
==parser03.py(엑셀자료->'''lst파일->국문/영문 파싱''')== | ==parser03.py(엑셀자료->'''lst파일->국문/영문 파싱''')== | ||
2020년 4월 9일 (목) 20:13 판
parser01.py(대단위)
#!/usr/bin/python
#-*- coding: utf-8 -*-
import sys
def main():
    """Print every tab-separated field of ``<argv[1]>.lst`` on its own line.

    The first field of each input line is printed as-is; every following
    field is printed on a separate line prefixed with two tab characters.

    The base file name is taken from ``sys.argv[1]`` and the suffix ``.lst``
    is appended. When the argument is missing, a usage line is printed and
    the function returns without reading anything.

    Raises:
        OSError: if the ``.lst`` file cannot be opened.
    """
    try:
        filename = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; anything else should
        # surface instead of being silently swallowed.
        print('\nparser01.py [file name]')
        return

    list_path = filename + '.lst'
    # The context manager closes the file even if printing raises.
    with open(list_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split(sep='\t')
            for index, field in enumerate(fields):
                if index == 0:
                    print(field)
                else:
                    print('\t\t{0}'.format(field))
# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
parser02.py(탭단위 세분화)
#!/usr/bin/python
#-*- coding: utf-8 -*-
import sys
def main():
    """Print a condensed, tab-separated view of each row of ``<argv[1]>.lst``.

    Command line: ``parser02.py [file name] [task]``.

    For every non-blank input line the tab-separated columns are combined as
    ``col0 <TAB> col2_col3 <TAB> col4-col5-col6 <TAB> title`` where ``title``
    is column 7 when ``task`` is ``'1'``, column 9 when ``task`` is ``'2'``
    and the empty string for any other task value.

    Blank lines are skipped, and rows with fewer than ten columns are padded
    with empty strings, so short rows no longer abort the run with an
    ``IndexError``.

    Raises:
        OSError: if the ``.lst`` file cannot be opened.
    """
    try:
        filename = sys.argv[1]
        task = sys.argv[2]
    except IndexError:
        print('\nparser02.py [file name] [task: 1, 2]')
        return

    # Column that holds the title for each supported task; None -> no title.
    title_column = {'1': 7, '2': 9}.get(task)
    min_columns = 10  # highest index read below is 9

    list_path = filename + '.lst'
    # The context manager closes the file even if a row fails to print.
    with open(list_path, 'r', encoding='utf-8') as f:
        for line in f:
            text = line.strip()
            if not text:
                continue  # nothing to report for an empty line
            parsed = text.split(sep='\t')
            # strip() drops trailing empty columns; restore them as ''.
            if len(parsed) < min_columns:
                parsed.extend([''] * (min_columns - len(parsed)))
            title = '' if title_column is None else parsed[title_column]
            print('{0}\t{1}_{2}\t{3}-{4}-{5}\t{6}'.format(
                parsed[0], parsed[2], parsed[3],
                parsed[4], parsed[5], parsed[6], title))
# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
parser03.py(엑셀자료->lst파일->국문/영문 파싱)
#!/usr/bin/python
#-*- coding: utf-8 -*-
import sys
def main():
    """Print one output row per title word for each row of ``<argv[1]>.lst``.

    Command line: ``parser03 [file name] [task]``.

    For every non-blank input line the title is taken from column 7 when
    ``task`` is ``'1'``, from column 9 when ``task`` is ``'2'`` and is empty
    otherwise. Commas and parentheses are removed from the title, it is
    split on single spaces, and for each resulting word a row
    ``col0 <TAB> col2_col3 <TAB> col4-col5-col6 <TAB> word`` is printed.

    Blank lines are skipped and rows with fewer than ten columns are padded
    with empty strings instead of raising ``IndexError``. Empty tokens left
    behind by removed punctuation or repeated spaces are dropped; a row whose
    title is empty still yields exactly one output row with an empty word.

    Raises:
        OSError: if the ``.lst`` file cannot be opened.
    """
    try:
        filename = sys.argv[1]
        task = sys.argv[2]
    except IndexError:
        print('\nparser03 [file name] [task: 1, 2]')
        return

    # Column that holds the title for each supported task; None -> no title.
    title_column = {'1': 7, '2': 9}.get(task)
    min_columns = 10  # highest index read below is 9
    # Single-pass removal of the punctuation the original stripped one by one.
    drop_punctuation = str.maketrans('', '', ',()')

    list_path = filename + '.lst'
    # The context manager closes the file even if a row fails to print.
    with open(list_path, 'r', encoding='utf-8') as f:
        for line in f:
            text = line.strip()
            if not text:
                continue  # nothing to report for an empty line
            parsed = text.split(sep='\t')
            # strip() drops trailing empty columns; restore them as ''.
            if len(parsed) < min_columns:
                parsed.extend([''] * (min_columns - len(parsed)))

            title = '' if title_column is None else parsed[title_column]
            title = title.translate(drop_punctuation).strip()
            # Keep the single-space separator, but discard empty tokens that
            # appear where punctuation was removed between two spaces.
            words = [w for w in title.split(sep=' ') if w] or ['']

            for word in words:
                print('{0}\t{1}_{2}\t{3}-{4}-{5}\t{6}'.format(
                    parsed[0], parsed[2], parsed[3],
                    parsed[4], parsed[5], parsed[6], word))
# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()