인문지식 처리와 프로그래밍2020 4.16

soook
Soook (토론 | 기여) 사용자의 2020년 4월 22일 (수) 22:42 판 (새 문서: <pre> #!/usr/bin/python import sys import os import urllib.request import shutil address = "http://www.heritage.go.kr/heri/cul/culSelectDetail.do?ccbaCpno=" #문화재청 국가...)

(차이) ← 이전 판 | 최신판 (차이) | 다음 판 → (차이)
이동: 둘러보기, 검색

#!/usr/bin/python

import sys
import os
import urllib.request
import shutil

address = "http://www.heritage.go.kr/heri/cul/culSelectDetail.do?ccbaCpno="  # Cultural Heritage Administration national heritage portal: detail-page URL prefix; a ccbaCpno value is appended to it
	
def _findJpgUrl( text ):
	"""Return the first http(s) URL ending in .jpg inside *text*, or None."""
	# Upper-case only the letters we search for. Unlike str.upper(), this
	# mapping never changes the string length, so positions found in the
	# folded copy are valid indices into the original text.
	folded = text.translate(str.maketrans("htpsjg", "HTPSJG"))

	starts = [pos for pos in (folded.find("HTTP:"), folded.find("HTTPS:")) if pos != -1]
	if not starts:
		return None
	begin = min(starts)

	# Look for the extension only after the scheme, so an earlier ".jpg"
	# on the same line cannot produce an empty or reversed slice.
	ext = folded.find(".JPG", begin)
	if ext == -1:
		return None
	return text[begin:ext + 4]


def downImage( document, sequence, urlstring ):
	"""Download the JPEG linked in *urlstring*, or copy the placeholder image.

	The image is saved next to *document* under the name
	"<document without extension>-NNN.jpg", where NNN is *sequence*
	zero-padded to three digits.

	Args:
		document: Path of the saved HTML page the image belongs to.
		sequence: Integer used as the three-digit suffix of the file name.
		urlstring: A line of text expected to contain an http:// or
			https:// URL ending in ".jpg" (matched case-insensitively).

	If no such URL is present, or the download fails for any reason,
	'noimage.jpg' from the current directory is copied to the target
	path instead. Errors raised by that copy (for example a missing
	placeholder file) propagate to the caller.
	"""
	url = _findJpgUrl(urlstring)
	base = os.path.splitext(document)[0]
	path = "{0}-{1:03}.jpg".format(base, sequence)

	downloaded = False
	if url is not None:
		try:
			urllib.request.urlretrieve(url, path)
			downloaded = True
		except Exception:
			# Best effort: any download failure falls back to the
			# placeholder. KeyboardInterrupt/SystemExit are deliberately
			# not caught so the batch can still be interrupted.
			downloaded = False

	if not downloaded:
		shutil.copy2('noimage.jpg', path)


def findImage( path ):
	"""Fetch the first JPEG referenced in the HTML file at *path*.

	The file is scanned line by line for the first line containing
	".jpg" (case-insensitive). That line is handed to downImage() with
	sequence number 0; only the first match is processed.

	If no line mentions a JPEG, a "No Image" message is printed and
	'noimage.jpg' from the current directory is copied to
	"<path without extension>-000.jpg".

	Args:
		path: Path of the saved HTML page to scan.
	"""
	firstHit = None

	# The context manager closes the file on every exit path, including
	# the early exit once a match is found. errors='replace' keeps a
	# stray non-UTF-8 byte from aborting the scan.
	with open( path, 'rt', encoding='UTF8', errors='replace' ) as page:
		for line in page:
			if ".JPG" in line.upper():
				firstHit = line.strip()
				break

	if firstHit is not None:
		downImage( path, 0, firstHit )
		return

	print( "{0}: No Image".format(path))
	placeholder = "{0}-000.jpg".format(os.path.splitext(path)[0])
	shutil.copy2('noimage.jpg', placeholder)
	
def downHtml( folder, file ):
	"""Download one detail page and then fetch its first image.

	The page URL is the module-level `address` prefix followed by
	*file*; the page is saved as "<folder>/<file>.htm" and then passed
	to findImage().

	Args:
		folder: Existing directory the page is written into.
		file: The identifier appended to the URL and used as file name.

	If the download fails, "<file>: Invalid Cpno!" is printed and the
	function returns without looking for an image.
	"""
	url = address + file
	path = "{0}/{1}.htm".format(folder, file)

	try:
		urllib.request.urlretrieve(url, path)
	except Exception:
		# Best effort per entry: report and move on. Catching Exception
		# rather than everything lets Ctrl-C still stop the batch.
		print( "{0}: Invalid Cpno!".format( file ) )
		return

	findImage( path )

def main():
	"""Process every identifier listed in "<NAME>.lst".

	NAME is taken from the first command-line argument. A directory
	called NAME is created if it does not exist yet, and each non-blank
	line of "NAME.lst" is stripped and passed to downHtml() together
	with that directory.

	Without an argument a usage hint is written to stderr and the
	function returns without doing anything else.
	"""
	if len(sys.argv) < 2:
		prog = sys.argv[0] if sys.argv else "script"
		print( "usage: {0} NAME  (reads NAME.lst, writes into NAME/)".format(prog), file=sys.stderr )
		return

	name = sys.argv[1]
	listPath = name + ".lst"
	folder = name

	# Only "directory already exists" is tolerated; other failures
	# (e.g. permissions) surface here instead of as failed downloads.
	os.makedirs( folder, exist_ok=True )

	with open( listPath ) as listing:
		for line in listing:
			cpno = line.strip()
			if not cpno:
				# A blank line would otherwise request the page with an
				# empty identifier and save it as "<folder>/.htm".
				continue
			print( "{0}: Processing....".format(cpno))
			downHtml( folder, cpno )

# Run the batch only when executed as a script, so importing this module
# does not trigger any downloads.
if __name__ == "__main__":
	main()