인문지식 처리와 프로그래밍2020 4.16
soook
Soook (토론 | 기여) 사용자의 2020년 4월 22일 (수) 22:42 판 (새 문서: <pre> #!/usr/bin/python import sys import os import urllib.request import shutil address = "http://www.heritage.go.kr/heri/cul/culSelectDetail.do?ccbaCpno=" #문화재청 국가...)
#!/usr/bin/python
import os
import re
import shutil
import sys
import urllib.request
# Base URL of a cultural-property detail page on the Cultural Heritage
# Administration's National Cultural Heritage Portal. It ends with the
# ``ccbaCpno=`` query parameter, so a property number is appended to it
# to form the full page URL.
address = "http://www.heritage.go.kr/heri/cul/culSelectDetail.do?ccbaCpno="
def downImage(document, sequence, urlstring):
    """Save the first JPEG referenced in *urlstring* next to *document*.

    The image is written to ``<document without its last 4 characters>-NNN.jpg``
    where ``NNN`` is *sequence* zero-padded to three digits.  When no usable
    URL is found in the line, or the download fails, ``noimage.jpg`` (looked
    up relative to the current working directory) is copied to that path
    instead, so every document ends up with an image file.

    Args:
        document: Path of the saved HTML page; its last four characters
            (e.g. ``.htm``) are dropped to build the image file name.
        sequence: Integer index used for the ``NNN`` part of the file name.
        urlstring: A line of HTML text expected to contain an image URL.

    Raises:
        OSError: If the placeholder ``noimage.jpg`` cannot be copied.
    """
    target = "{0}-{1:03}.jpg".format(document[:-4], sequence)

    # Match on the original text (case-insensitively) instead of slicing with
    # offsets computed on an upper-cased copy.  The pattern accepts both
    # http and https, only looks for ".jpg" *after* the scheme, and stops at
    # quotes/whitespace/angle brackets so an unrelated earlier link on the
    # same line is not glued onto the image URL.
    match = re.search(r"""https?://[^\s"'<>]+?\.jpg""", urlstring, re.IGNORECASE)

    if match is not None:
        try:
            urllib.request.urlretrieve(match.group(0), target)
        except Exception:
            # Best effort: any download problem (network, HTTP status, bad
            # URL) falls through to the placeholder.  Unlike a bare
            # ``except:``, this lets Ctrl-C and SystemExit propagate.
            pass
        else:
            return

    shutil.copy2("noimage.jpg", target)
def findImage(path):
    """Fetch the first JPEG referenced in the saved HTML page at *path*.

    The page is scanned line by line; the first line containing ``.jpg``
    (case-insensitive) is handed to ``downImage`` with sequence number 0 and
    scanning stops, so only one image is downloaded per page.  If no line
    mentions a JPEG, a "No Image" message is printed and ``noimage.jpg`` from
    the current working directory is copied to ``<path without its last 4
    characters>-000.jpg``.

    Args:
        path: Path of the HTML file to scan.

    Raises:
        OSError: If *path* cannot be opened or the placeholder cannot be copied.
    """
    # ``with`` closes the file on the early return as well (the handle used
    # to leak there).  Undecodable bytes are replaced rather than raising,
    # because only ASCII markers (".jpg", the URL) are needed from the text
    # and one odd byte should not abort a whole batch run.
    with open(path, "rt", encoding="UTF8", errors="replace") as page:
        for line in page:
            if ".JPG" in line.upper():
                downImage(path, 0, line.strip())
                return

    print("{0}: No Image".format(path))
    shutil.copy2("noimage.jpg", "{0}-000.jpg".format(path[:-4]))
def downHtml(folder, file):
    """Download one detail page and then fetch its image.

    The page URL is the module-level ``address`` with *file* appended; the
    page is stored as ``<folder>/<file>.htm`` and passed to ``findImage``.
    If the download fails, an "Invalid Cpno!" message is printed and the
    function returns without looking for an image.

    Args:
        folder: Existing directory in which the page is stored.
        file: Identifier appended to ``address``; also used as the base name
            of the saved page.
    """
    url = address + file
    path = "{0}/{1}.htm".format(folder, file)

    try:
        urllib.request.urlretrieve(url, path)
    except Exception:
        # Deliberately broad so one bad entry does not stop the batch, but
        # not a bare ``except:`` -- Ctrl-C must still be able to abort
        # instead of being reported as an invalid number.
        print("{0}: Invalid Cpno!".format(file))
        return

    findImage(path)
def main():
    """Process every entry listed in ``<name>.lst`` into the folder ``<name>``.

    ``<name>`` is the first command-line argument.  The folder is created if
    it does not exist yet.  Each non-blank line of the list file is stripped,
    announced with a "Processing...." message and handed to ``downHtml``.
    Without an argument a usage line is written to stderr and nothing else
    happens.

    Raises:
        OSError: If the folder cannot be created or the list file cannot be
            opened.
    """
    if len(sys.argv) < 2:
        print(
            "usage: {0} NAME  (reads NAME.lst, writes into NAME/)".format(sys.argv[0]),
            file=sys.stderr,
        )
        return

    name = sys.argv[1]
    list_path = name + ".lst"
    folder = name

    # Only "already exists" is tolerated; other failures (e.g. permissions)
    # surface here instead of as a misleading error for every entry later.
    os.makedirs(folder, exist_ok=True)

    with open(list_path) as listing:
        for line in listing:
            entry = line.strip()
            if not entry:
                # A blank line would otherwise be requested as an empty number.
                continue
            print("{0}: Processing....".format(entry))
            downHtml(folder, entry)
# Run the batch only when this file is executed as a script, so importing
# the module (e.g. to reuse its functions) has no side effects.
if __name__ == "__main__":
    main()